author     Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/X86
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Target/X86')
75 files changed, 9120 insertions, 5726 deletions
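Editorial note on the AsmParser hunks below: X86AsmParser.cpp moves the Intel-syntax `offset` operator into the expression state machine (the new IES_OFFSET state, onOffset(), and ParseIntelOffsetOperator()), replacing the old stand-alone ParseIntelOffsetOfOperator() and the removed "compound immediate expression is yet to be supported" error. A minimal, hypothetical MS-style inline-asm snippet of the kind of input this enables (illustrative only; the symbol and function names are made up and are not part of the commit, and the snippet builds only where MS inline asm is supported, e.g. clang -fasm-blocks targeting x86):

    // Hypothetical illustration: with IES_OFFSET handling, 'offset' can take
    // part in a compound immediate expression instead of only standing alone.
    int table[16];

    void load_entry_address() {
      __asm {
        mov eax, offset table        // plain use: address of 'table'
        mov ecx, offset table + 8    // compound expression: address of table[2]
      }
    }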
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 25be79ec2b1e..d37d812df485 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -134,7 +134,6 @@ private: IOK_LENGTH, IOK_SIZE, IOK_TYPE, - IOK_OFFSET }; class InfixCalculator { @@ -326,6 +325,7 @@ private: IES_RSHIFT, IES_PLUS, IES_MINUS, + IES_OFFSET, IES_NOT, IES_MULTIPLY, IES_DIVIDE, @@ -350,16 +350,30 @@ private: InlineAsmIdentifierInfo Info; short BracCount; bool MemExpr; + bool OffsetOperator; + SMLoc OffsetOperatorLoc; + + bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) { + if (Sym) { + ErrMsg = "cannot use more than one symbol in memory operand"; + return true; + } + Sym = Val; + SymName = ID; + return false; + } public: IntelExprStateMachine() : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0), - MemExpr(false) {} + MemExpr(false), OffsetOperator(false) {} void addImm(int64_t imm) { Imm += imm; } short getBracCount() { return BracCount; } bool isMemExpr() { return MemExpr; } + bool isOffsetOperator() { return OffsetOperator; } + SMLoc getOffsetLoc() { return OffsetOperatorLoc; } unsigned getBaseReg() { return BaseReg; } unsigned getIndexReg() { return IndexReg; } unsigned getScale() { return Scale; } @@ -456,6 +470,7 @@ private: case IES_INTEGER: case IES_RPAREN: case IES_REGISTER: + case IES_OFFSET: State = IES_PLUS; IC.pushOperator(IC_PLUS); if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { @@ -500,10 +515,12 @@ private: case IES_INTEGER: case IES_REGISTER: case IES_INIT: + case IES_OFFSET: State = IES_MINUS; // push minus operator if it is not a negate operator if (CurrState == IES_REGISTER || CurrState == IES_RPAREN || - CurrState == IES_INTEGER || CurrState == IES_RBRAC) + CurrState == IES_INTEGER || CurrState == IES_RBRAC || + CurrState == IES_OFFSET) IC.pushOperator(IC_MINUS); else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { // We have negate operator for Scale: it's illegal @@ -556,7 +573,6 @@ private: } PrevState = CurrState; } - bool onRegister(unsigned Reg, StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { @@ -604,7 +620,6 @@ private: if (auto *CE = dyn_cast<MCConstantExpr>(SymRef)) return onInteger(CE->getValue(), ErrMsg); PrevState = State; - bool HasSymbol = Sym != nullptr; switch (State) { default: State = IES_ERROR; @@ -614,18 +629,16 @@ private: case IES_NOT: case IES_INIT: case IES_LBRAC: + if (setSymRef(SymRef, SymRefName, ErrMsg)) + return true; MemExpr = true; State = IES_INTEGER; - Sym = SymRef; - SymName = SymRefName; IC.pushOperand(IC_IMM); if (ParsingInlineAsm) Info = IDInfo; break; } - if (HasSymbol) - ErrMsg = "cannot use more than one symbol in memory operand"; - return HasSymbol; + return false; } bool onInteger(int64_t TmpInt, StringRef &ErrMsg) { IntelExprState CurrState = State; @@ -738,6 +751,7 @@ private: State = IES_ERROR; break; case IES_INTEGER: + case IES_OFFSET: case IES_REGISTER: case IES_RPAREN: if (BracCount-- != 1) @@ -792,6 +806,7 @@ private: State = IES_ERROR; break; case IES_INTEGER: + case IES_OFFSET: case IES_REGISTER: case IES_RPAREN: State = IES_RPAREN; @@ -799,6 +814,32 @@ private: break; } } + bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, + const InlineAsmIdentifierInfo &IDInfo, bool ParsingInlineAsm, + StringRef &ErrMsg) { + PrevState = State; + switch (State) { + default: + ErrMsg = "unexpected 
offset operator expression"; + return true; + case IES_PLUS: + case IES_INIT: + case IES_LBRAC: + if (setSymRef(Val, ID, ErrMsg)) + return true; + OffsetOperator = true; + OffsetOperatorLoc = OffsetLoc; + State = IES_OFFSET; + // As we cannot yet resolve the actual value (offset), we retain + // the requested semantics by pushing a '0' to the operands stack + IC.pushOperand(IC_IMM); + if (ParsingInlineAsm) { + Info = IDInfo; + } + break; + } + return false; + } }; bool Error(SMLoc L, const Twine &Msg, SMRange Range = None, @@ -830,18 +871,21 @@ private: std::unique_ptr<X86Operand> ParseOperand(); std::unique_ptr<X86Operand> ParseATTOperand(); std::unique_ptr<X86Operand> ParseIntelOperand(); - std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); + bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, + InlineAsmIdentifierInfo &Info, SMLoc &End); bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); unsigned IdentifyIntelInlineAsmOperator(StringRef Name); unsigned ParseIntelInlineAsmOperator(unsigned OpKind); std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start); - bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM); + bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM, + bool &ParseError, SMLoc &End); void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start, SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, SMLoc &End); + bool IsUnevaluatedOperand, SMLoc &End, + bool IsParsingOffsetOperator = false); std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, const MCExpr *&Disp, @@ -1112,9 +1156,10 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) RegNo = MatchRegisterName(Tok.getString().lower()); - // The "flags" register cannot be referenced directly. + // The "flags" and "mxcsr" registers cannot be referenced directly. // Treat it as an identifier instead. 
- if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + if (isParsingInlineAsm() && isParsingIntelSyntax() && + (RegNo == X86::EFLAGS || RegNo == X86::MXCSR)) RegNo = 0; if (!is64BitMode()) { @@ -1408,26 +1453,44 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Some binary bitwise operators have a named synonymous // Query a candidate string for being such a named operator // and if so - invoke the appropriate handler -bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM) { +bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, + IntelExprStateMachine &SM, + bool &ParseError, SMLoc &End) { // A named operator should be either lower or upper case, but not a mix if (Name.compare(Name.lower()) && Name.compare(Name.upper())) return false; - if (Name.equals_lower("not")) + if (Name.equals_lower("not")) { SM.onNot(); - else if (Name.equals_lower("or")) + } else if (Name.equals_lower("or")) { SM.onOr(); - else if (Name.equals_lower("shl")) + } else if (Name.equals_lower("shl")) { SM.onLShift(); - else if (Name.equals_lower("shr")) + } else if (Name.equals_lower("shr")) { SM.onRShift(); - else if (Name.equals_lower("xor")) + } else if (Name.equals_lower("xor")) { SM.onXor(); - else if (Name.equals_lower("and")) + } else if (Name.equals_lower("and")) { SM.onAnd(); - else if (Name.equals_lower("mod")) + } else if (Name.equals_lower("mod")) { SM.onMod(); - else + } else if (Name.equals_lower("offset")) { + SMLoc OffsetLoc = getTok().getLoc(); + const MCExpr *Val = nullptr; + StringRef ID; + InlineAsmIdentifierInfo Info; + ParseError = ParseIntelOffsetOperator(Val, ID, Info, End); + if (ParseError) + return true; + StringRef ErrMsg; + ParseError = + SM.onOffset(Val, OffsetLoc, ID, Info, isParsingInlineAsm(), ErrMsg); + if (ParseError) + return Error(SMLoc::getFromPointer(Name.data()), ErrMsg); + } else { return false; + } + if (!Name.equals_lower("offset")) + End = consumeToken(); return true; } @@ -1470,8 +1533,12 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { break; } // Operator synonymous ("not", "or" etc.) 
- if ((UpdateLocLex = ParseIntelNamedOperator(Identifier, SM))) + bool ParseError = false; + if (ParseIntelNamedOperator(Identifier, SM, ParseError, End)) { + if (ParseError) + return true; break; + } // Symbol reference, when parsing assembly content InlineAsmIdentifierInfo Info; const MCExpr *Val; @@ -1485,9 +1552,6 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } // MS InlineAsm operators (TYPE/LENGTH/SIZE) if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) { - if (OpKind == IOK_OFFSET) - return Error(IdentLoc, "Dealing OFFSET operator as part of" - "a compound immediate expression is yet to be supported"); if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) { if (SM.onInteger(Val, ErrMsg)) return Error(IdentLoc, ErrMsg); @@ -1589,9 +1653,9 @@ void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Loc = Start; unsigned ExprLen = End.getPointer() - Start.getPointer(); // Skip everything before a symbol displacement (if we have one) - if (SM.getSym()) { + if (SM.getSym() && !SM.isOffsetOperator()) { StringRef SymName = SM.getSymName(); - if (unsigned Len = SymName.data() - Start.getPointer()) + if (unsigned Len = SymName.data() - Start.getPointer()) InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len); Loc = SMLoc::getFromPointer(SymName.data() + SymName.size()); ExprLen = End.getPointer() - (SymName.data() + SymName.size()); @@ -1606,21 +1670,23 @@ void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM, // Build an Intel Expression rewrite StringRef BaseRegStr; StringRef IndexRegStr; + StringRef OffsetNameStr; if (SM.getBaseReg()) BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg()); if (SM.getIndexReg()) IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg()); + if (SM.isOffsetOperator()) + OffsetNameStr = SM.getSymName(); // Emit it - IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), SM.getImm(), SM.isMemExpr()); + IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), OffsetNameStr, + SM.getImm(), SM.isMemExpr()); InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr); } // Inline assembly may use variable names with namespace alias qualifiers. -bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val, - StringRef &Identifier, - InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, - SMLoc &End) { +bool X86AsmParser::ParseIntelInlineAsmIdentifier( + const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) { MCAsmParser &Parser = getParser(); assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; @@ -1653,9 +1719,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val, SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), Loc, false); assert(InternalName.size() && "We should have an internal name here."); - // Push a rewrite for replacing the identifier name with the internal name. - InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), - InternalName); + // Push a rewrite for replacing the identifier name with the internal name, + // unless we are parsing the operand of an offset operator + if (!IsParsingOffsetOperator) + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); + else + Identifier = InternalName; } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) return false; // Create the symbol reference. 
@@ -1738,39 +1808,25 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) return false; } -/// Parse the 'offset' operator. This operator is used to specify the -/// location rather then the content of a variable. -std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { - MCAsmParser &Parser = getParser(); - const AsmToken &Tok = Parser.getTok(); - SMLoc OffsetOfLoc = Tok.getLoc(); - Parser.Lex(); // Eat offset. - - const MCExpr *Val; - InlineAsmIdentifierInfo Info; - SMLoc Start = Tok.getLoc(), End; - StringRef Identifier = Tok.getString(); - if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) - return nullptr; - - void *Decl = nullptr; - // FIXME: MS evaluates "offset <Constant>" to the underlying integral - if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) - return ErrorOperand(Start, "offset operator cannot yet handle constants"); - else if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) - Decl = Info.Var.Decl; - // Don't emit the offset operator. - InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); - - // The offset operator will have an 'r' constraint, thus we need to create - // register operand to ensure proper matching. Just pick a GPR based on - // the size of a pointer. - bool Parse32 = is32BitMode() || Code16GCC; - unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX); - - return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, - OffsetOfLoc, Identifier, Decl); +/// Parse the 'offset' operator. +/// This operator is used to specify the location of a given operand +bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, + InlineAsmIdentifierInfo &Info, + SMLoc &End) { + // Eat offset, mark start of identifier. + SMLoc Start = Lex().getLoc(); + ID = getTok().getString(); + if (!isParsingInlineAsm()) { + if ((getTok().isNot(AsmToken::Identifier) && + getTok().isNot(AsmToken::String)) || + getParser().parsePrimaryExpr(Val, End)) + return Error(Start, "unexpected token!"); + } else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) { + return Error(Start, "unable to lookup expression"); + } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) { + return Error(Start, "offset operator cannot yet handle constants"); + } + return false; } // Query a candidate string for being an Intel assembly operator @@ -1780,7 +1836,6 @@ unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { .Cases("TYPE","type",IOK_TYPE) .Cases("SIZE","size",IOK_SIZE) .Cases("LENGTH","length",IOK_LENGTH) - .Cases("OFFSET","offset",IOK_OFFSET) .Default(IOK_INVALID); } @@ -1850,13 +1905,6 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; - // FIXME: Offset operator - // Should be handled as part of immediate expression, as other operators - // Currently, only supported as a stand-alone operand - if (isParsingInlineAsm()) - if (IdentifyIntelInlineAsmOperator(Tok.getString()) == IOK_OFFSET) - return ParseIntelOffsetOfOperator(); - // Parse optional Size directive. 
unsigned Size; if (ParseIntelMemoryOperandSize(Size)) @@ -1904,8 +1952,19 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // RegNo != 0 specifies a valid segment register, // and we are parsing a segment override - if (!SM.isMemExpr() && !RegNo) + if (!SM.isMemExpr() && !RegNo) { + if (isParsingInlineAsm() && SM.isOffsetOperator()) { + const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo(); + if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { + // Disp includes the address of a variable; make sure this is recorded + // for later handling. + return X86Operand::CreateImm(Disp, Start, End, SM.getSymName(), + Info.Var.Decl, Info.Var.IsGlobalLV); + } + } + return X86Operand::CreateImm(Disp, Start, End); + } StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); @@ -3131,6 +3190,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int: if (ForcedVEXEncoding != VEXEncoding_EVEX) return Match_Unsupported; + break; } return Match_Success; @@ -3879,7 +3939,7 @@ bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { } // Force static initialization. -extern "C" void LLVMInitializeX86AsmParser() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmParser() { RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target()); RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target()); } diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 3a76d023e640..d831a63b04ee 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -36,6 +36,7 @@ struct X86Operand final : public MCParsedAsmOperand { StringRef SymName; void *OpDecl; bool AddressOf; + bool CallOperand; struct TokOp { const char *Data; @@ -52,6 +53,7 @@ struct X86Operand final : public MCParsedAsmOperand { struct ImmOp { const MCExpr *Val; + bool LocalRef; }; struct MemOp { @@ -77,7 +79,7 @@ struct X86Operand final : public MCParsedAsmOperand { }; X86Operand(KindTy K, SMLoc Start, SMLoc End) - : Kind(K), StartLoc(Start), EndLoc(End) {} + : Kind(K), StartLoc(Start), EndLoc(End), CallOperand(false) {} StringRef getSymName() override { return SymName; } void *getOpDecl() override { return OpDecl; } @@ -104,8 +106,8 @@ struct X86Operand final : public MCParsedAsmOperand { } else if (Val->getKind() == MCExpr::SymbolRef) { if (auto *SRE = dyn_cast<MCSymbolRefExpr>(Val)) { const MCSymbol &Sym = SRE->getSymbol(); - if (auto SymName = Sym.getName().data()) - OS << VName << SymName; + if (const char *SymNameStr = Sym.getName().data()) + OS << VName << SymNameStr; } } }; @@ -278,13 +280,9 @@ struct X86Operand final : public MCParsedAsmOperand { return isImmUnsignedi8Value(CE->getValue()); } - bool isOffsetOf() const override { - return OffsetOfLoc.getPointer(); - } + bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; } - bool needAddressOf() const override { - return AddressOf; - } + bool needAddressOf() const override { return AddressOf; } bool isMem() const override { return Kind == Memory; } bool isMemUnsized() const { @@ -613,9 +611,16 @@ struct X86Operand final : public MCParsedAsmOperand { } static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, - SMLoc StartLoc, SMLoc EndLoc) { + SMLoc StartLoc, SMLoc EndLoc, + StringRef SymName = StringRef(), + void *OpDecl = nullptr, + bool GlobalRef = true) { auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); - Res->Imm.Val = Val; + Res->Imm.Val = Val; + 
Res->Imm.LocalRef = !GlobalRef; + Res->SymName = SymName; + Res->OpDecl = OpDecl; + Res->AddressOf = true; return Res; } diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 9a635bbe5f85..ea8c606d1564 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -84,6 +84,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -92,24 +93,1552 @@ using namespace llvm::X86Disassembler; #define DEBUG_TYPE "x86-disassembler" -void llvm::X86Disassembler::Debug(const char *file, unsigned line, - const char *s) { - dbgs() << file << ":" << line << ": " << s; +#define debug(s) LLVM_DEBUG(dbgs() << __LINE__ << ": " << s); + +// Specifies whether a ModR/M byte is needed and (if so) which +// instruction each possible value of the ModR/M byte corresponds to. Once +// this information is known, we have narrowed down to a single instruction. +struct ModRMDecision { + uint8_t modrm_type; + uint16_t instructionIDs; +}; + +// Specifies which set of ModR/M->instruction tables to look at +// given a particular opcode. +struct OpcodeDecision { + ModRMDecision modRMDecisions[256]; +}; + +// Specifies which opcode->instruction tables to look at given +// a particular context (set of attributes). Since there are many possible +// contexts, the decoder first uses CONTEXTS_SYM to determine which context +// applies given a specific set of attributes. Hence there are only IC_max +// entries in this table, rather than 2^(ATTR_max). +struct ContextDecision { + OpcodeDecision opcodeDecisions[IC_max]; +}; + +#include "X86GenDisassemblerTables.inc" + +static InstrUID decode(OpcodeType type, InstructionContext insnContext, + uint8_t opcode, uint8_t modRM) { + const struct ModRMDecision *dec; + + switch (type) { + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP8_MAP: + dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOP9_MAP: + dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case XOPA_MAP: + dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEDNOW_MAP: + dec = + &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + llvm_unreachable("Corrupt table! 
Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs + 1]; + return modRMTable[dec->instructionIDs]; + case MODRM_SPLITREG: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3) + 8]; + return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)]; + case MODRM_SPLITMISC: + if (modFromModRM(modRM) == 0x3) + return modRMTable[dec->instructionIDs + (modRM & 0x3f) + 8]; + return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)]; + case MODRM_FULL: + return modRMTable[dec->instructionIDs + modRM]; + } } -StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode, - const void *mii) { - const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii); - return MII->getName(Opcode); +static bool peek(struct InternalInstruction *insn, uint8_t &byte) { + uint64_t offset = insn->readerCursor - insn->startLocation; + if (offset >= insn->bytes.size()) + return true; + byte = insn->bytes[offset]; + return false; } -#define debug(s) LLVM_DEBUG(Debug(__FILE__, __LINE__, s)); +template <typename T> static bool consume(InternalInstruction *insn, T &ptr) { + auto r = insn->bytes; + uint64_t offset = insn->readerCursor - insn->startLocation; + if (offset + sizeof(T) > r.size()) + return true; + T ret = 0; + for (unsigned i = 0; i < sizeof(T); ++i) + ret |= (uint64_t)r[offset + i] << (i * 8); + ptr = ret; + insn->readerCursor += sizeof(T); + return false; +} + +static bool isREX(struct InternalInstruction *insn, uint8_t prefix) { + return insn->mode == MODE_64BIT && prefix >= 0x40 && prefix <= 0x4f; +} + +// Consumes all of an instruction's prefix bytes, and marks the +// instruction as having them. Also sets the instruction's default operand, +// address, and other relevant data sizes to report operands correctly. +// +// insn must not be empty. +static int readPrefixes(struct InternalInstruction *insn) { + bool isPrefix = true; + uint8_t byte = 0; + uint8_t nextByte; + + LLVM_DEBUG(dbgs() << "readPrefixes()"); + + while (isPrefix) { + // If we fail reading prefixes, just stop here and let the opcode reader + // deal with it. + if (consume(insn, byte)) + break; + + // If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then + // break and let it be disassembled as a normal "instruction". + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK + break; + + if ((byte == 0xf2 || byte == 0xf3) && !peek(insn, nextByte)) { + // If the byte is 0xf2 or 0xf3, and any of the following conditions are + // met: + // - it is followed by a LOCK (0xf0) prefix + // - it is followed by an xchg instruction + // then it should be disassembled as a xacquire/xrelease not repne/rep. + if (((nextByte == 0xf0) || + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) { + insn->xAcquireRelease = true; + if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support + break; + } + // Also if the byte is 0xf3, and the following condition is met: + // - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or + // "mov mem, imm" (opcode 0xc6/0xc7) instructions. + // then it should be disassembled as an xrelease not rep. 
+ if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) { + insn->xAcquireRelease = true; + break; + } + if (isREX(insn, nextByte)) { + uint8_t nnextByte; + // Go to REX prefix after the current one + if (consume(insn, nnextByte)) + return -1; + // We should be able to read next byte after REX prefix + if (peek(insn, nnextByte)) + return -1; + --insn->readerCursor; + } + } + + switch (byte) { + case 0xf0: // LOCK + insn->hasLockPrefix = true; + break; + case 0xf2: // REPNE/REPNZ + case 0xf3: { // REP or REPE/REPZ + uint8_t nextByte; + if (peek(insn, nextByte)) + break; + // TODO: + // 1. There could be several 0x66 + // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then + // it's not mandatory prefix + // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need + // 0x0f exactly after it to be mandatory prefix + if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66) + // The last of 0xf2 /0xf3 is mandatory prefix + insn->mandatoryPrefix = byte; + insn->repeatPrefix = byte; + break; + } + case 0x2e: // CS segment override -OR- Branch not taken + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: // SS segment override -OR- Branch taken + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: // DS segment override + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: // ES segment override + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: // FS segment override + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: // GS segment override + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + case 0x66: { // Operand-size override { + uint8_t nextByte; + insn->hasOpSize = true; + if (peek(insn, nextByte)) + break; + // 0x66 can't overwrite existing mandatory prefix and should be ignored + if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte))) + insn->mandatoryPrefix = byte; + break; + } + case 0x67: // Address-size override + insn->hasAdSize = true; + break; + default: // Not a prefix byte + isPrefix = false; + break; + } + + if (isPrefix) + LLVM_DEBUG(dbgs() << format("Found prefix 0x%hhx", byte)); + } + + insn->vectorExtensionType = TYPE_NO_VEX_XOP; + + if (byte == 0x62) { + uint8_t byte1, byte2; + if (consume(insn, byte1)) { + LLVM_DEBUG(dbgs() << "Couldn't read second byte of EVEX prefix"); + return -1; + } + + if (peek(insn, byte2)) { + LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix"); + return -1; + } + + if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && + ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + insn->vectorExtensionType = TYPE_EVEX; + } else { + --insn->readerCursor; // unconsume byte1 + --insn->readerCursor; // unconsume byte + } + + if (insn->vectorExtensionType == TYPE_EVEX) { + insn->vectorExtensionPrefix[0] = byte; + insn->vectorExtensionPrefix[1] = byte1; + if (consume(insn, insn->vectorExtensionPrefix[2])) { + LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix"); + return -1; + } + if (consume(insn, insn->vectorExtensionPrefix[3])) { + LLVM_DEBUG(dbgs() << "Couldn't read fourth byte of EVEX prefix"); + return -1; + } + + // We simulate the REX prefix for simplicity's sake + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 | + (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) | + (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) | + (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) | + (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); + } + + LLVM_DEBUG( + dbgs() 
<< format( + "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3])); + } + } else if (byte == 0xc4) { + uint8_t byte1; + if (peek(insn, byte1)) { + LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) + insn->vectorExtensionType = TYPE_VEX_3B; + else + --insn->readerCursor; + + if (insn->vectorExtensionType == TYPE_VEX_3B) { + insn->vectorExtensionPrefix[0] = byte; + consume(insn, insn->vectorExtensionPrefix[1]); + consume(insn, insn->vectorExtensionPrefix[2]); + + // We simulate the REX prefix for simplicity's sake + + if (insn->mode == MODE_64BIT) + insn->rexPrefix = 0x40 | + (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) | + (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) | + (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) | + (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); + + LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2])); + } + } else if (byte == 0xc5) { + uint8_t byte1; + if (peek(insn, byte1)) { + LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) + insn->vectorExtensionType = TYPE_VEX_2B; + else + --insn->readerCursor; + + if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->vectorExtensionPrefix[0] = byte; + consume(insn, insn->vectorExtensionPrefix[1]); + + if (insn->mode == MODE_64BIT) + insn->rexPrefix = + 0x40 | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); + + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + default: + break; + case VEX_PREFIX_66: + insn->hasOpSize = true; + break; + } + + LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1])); + } + } else if (byte == 0x8f) { + uint8_t byte1; + if (peek(insn, byte1)) { + LLVM_DEBUG(dbgs() << "Couldn't read second byte of XOP"); + return -1; + } + + if ((byte1 & 0x38) != 0x0) // 0 in these 3 bits is a POP instruction. + insn->vectorExtensionType = TYPE_XOP; + else + --insn->readerCursor; + + if (insn->vectorExtensionType == TYPE_XOP) { + insn->vectorExtensionPrefix[0] = byte; + consume(insn, insn->vectorExtensionPrefix[1]); + consume(insn, insn->vectorExtensionPrefix[2]); + + // We simulate the REX prefix for simplicity's sake + + if (insn->mode == MODE_64BIT) + insn->rexPrefix = 0x40 | + (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) | + (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) | + (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) | + (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); + + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + default: + break; + case VEX_PREFIX_66: + insn->hasOpSize = true; + break; + } + + LLVM_DEBUG(dbgs() << format("Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", + insn->vectorExtensionPrefix[0], + insn->vectorExtensionPrefix[1], + insn->vectorExtensionPrefix[2])); + } + } else if (isREX(insn, byte)) { + if (peek(insn, nextByte)) + return -1; + insn->rexPrefix = byte; + LLVM_DEBUG(dbgs() << format("Found REX prefix 0x%hhx", byte)); + } else + --insn->readerCursor; + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (insn->hasOpSize ? 4 : 2); + insn->addressSize = (insn->hasAdSize ? 4 : 2); + insn->displacementSize = (insn->hasAdSize ? 
4 : 2); + insn->immediateSize = (insn->hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (insn->hasOpSize ? 2 : 4); + insn->addressSize = (insn->hasAdSize ? 2 : 4); + insn->displacementSize = (insn->hasAdSize ? 2 : 4); + insn->immediateSize = (insn->hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (insn->hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else { + insn->registerSize = (insn->hasOpSize ? 2 : 4); + insn->addressSize = (insn->hasAdSize ? 4 : 8); + insn->displacementSize = (insn->hasOpSize ? 2 : 4); + insn->immediateSize = (insn->hasOpSize ? 2 : 4); + } + } + + return 0; +} + +// Consumes the SIB byte to determine addressing information. +static int readSIB(struct InternalInstruction *insn) { + SIBBase sibBaseBase = SIB_BASE_NONE; + uint8_t index, base; + + LLVM_DEBUG(dbgs() << "readSIB()"); + switch (insn->addressSize) { + case 2: + default: + llvm_unreachable("SIB-based addressing doesn't work in 16-bit mode"); + case 4: + insn->sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + insn->sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consume(insn, insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + if (index == 0x4) { + insn->sibIndex = SIB_INDEX_NONE; + } else { + insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index); + } + + insn->sibScale = 1 << scaleFromSIB(insn->sib); + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + case 0xd: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + default: + llvm_unreachable("Cannot have Mod = 0b11 and a SIB byte"); + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +static int readDisplacement(struct InternalInstruction *insn) { + int8_t d8; + int16_t d16; + int32_t d32; + LLVM_DEBUG(dbgs() << "readDisplacement()"); + + insn->displacementOffset = insn->readerCursor - insn->startLocation; + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + break; + case EA_DISP_8: + if (consume(insn, d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consume(insn, d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consume(insn, d32)) + return -1; + insn->displacement = d32; + break; + } + + return 0; +} + +// Consumes all addressing information (ModR/M byte, SIB byte, and displacement. +static int readModRM(struct InternalInstruction *insn) { + uint8_t mod, rm, reg, evexrm; + LLVM_DEBUG(dbgs() << "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consume(insn, insn->modRM)) + return -1; + insn->consumedModRM = true; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + // This goes by insn->registerSize to pick the correct register, which messes + // up if we're using (say) XMM or 8-bit register operands. That gets fixed in + // fixupReg(). 
+ switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + + evexrm = 0; + if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) { + reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; + } + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: { + EABase eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + insn->displacementSize = 1; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + } + case 4: + case 8: { + EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; // readSIB may override this + // In determining whether RIP-relative mode is used (rm=5), + // or whether a SIB byte is present (rm=4), + // the extension bits (REX.b and EVEX.x) are ignored. + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64); + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + case 0x5: // RIP-relative + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(eaBaseBase + rm); + break; + } + break; + case 0x1: + insn->displacementSize = 1; + LLVM_FALLTHROUGH; + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); + switch (rm & 7) { + case 0x4: // SIB byte is present + insn->eaBase = EA_BASE_sib; + if (readSIB(insn) || readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(eaBaseBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 0x3: + insn->eaDisplacement = EA_DISP_NONE; + insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm); + break; + } + break; + } + } // switch (insn->addressSize) + + return 0; +} + +#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \ + static uint16_t name(struct InternalInstruction *insn, OperandType type, \ + uint8_t index, uint8_t *valid) { \ + *valid = 1; \ + switch (type) { \ + default: \ + debug("Unhandled register type"); \ + *valid = 0; \ + return 0; \ + case TYPE_Rv: \ + return base + index; \ + case TYPE_R8: \ + index &= mask; \ + if (index > 0xf) \ + *valid = 0; \ + if (insn->rexPrefix && index >= 4 && index <= 7) { \ + return prefix##_SPL + (index - 4); \ + } else { \ + return prefix##_AL + index; \ + } \ + case TYPE_R16: \ + index &= mask; \ + if (index > 0xf) \ + *valid = 0; \ + return prefix##_AX + index; \ + case TYPE_R32: \ + index &= mask; \ + if (index > 0xf) \ + *valid = 0; \ + return prefix##_EAX + index; \ + case TYPE_R64: \ + index &= mask; \ + if (index > 0xf) \ + *valid = 0; \ + return prefix##_RAX + index; \ + case TYPE_ZMM: \ + return prefix##_ZMM0 + index; \ + case TYPE_YMM: \ + return prefix##_YMM0 + index; \ + case TYPE_XMM: \ + return prefix##_XMM0 + index; \ + case TYPE_VK: \ + index &= 0xf; \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0 + index; \ + case TYPE_VK_PAIR: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_K0_K1 + (index / 2); \ + case TYPE_MM64: \ + return prefix##_MM0 + (index & 0x7); \ + case TYPE_SEGMENTREG: \ + if ((index & 7) > 5) \ + *valid = 0; \ + return prefix##_ES + (index & 7); \ + case TYPE_DEBUGREG: \ + return prefix##_DR0 + index; \ + case TYPE_CONTROLREG: \ + return prefix##_CR0 + index; \ + case TYPE_BNDR: \ + if (index > 3) \ + *valid = 0; \ + return prefix##_BND0 + index; \ + case TYPE_MVSIBX: \ + return prefix##_XMM0 + index; \ + case TYPE_MVSIBY: \ + return prefix##_YMM0 + index; \ + case TYPE_MVSIBZ: \ + return prefix##_ZMM0 + index; \ + } \ + } + +// Consult an operand type to determine the meaning of the reg or R/M field. If +// the operand is an XMM operand, for example, an operand would be XMM0 instead +// of AX, which readModRM() would otherwise misinterpret it as. +// +// @param insn - The instruction containing the operand. +// @param type - The operand type. +// @param index - The existing value of the field as reported by readModRM(). +// @param valid - The address of a uint8_t. The target is set to 1 if the +// field is valid for the register class; 0 if not. +// @return - The proper value. +GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f) +GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf) + +// Consult an operand specifier to determine which of the fixup*Value functions +// to use in correcting readModRM()'ss interpretation. +// +// @param insn - See fixup*Value(). +// @param op - The operand specifier. +// @return - 0 if fixup was successful; -1 if the register returned was +// invalid for its class. 
+static int fixupReg(struct InternalInstruction *insn, + const struct OperandSpecifier *op) { + uint8_t valid; + LLVM_DEBUG(dbgs() << "fixupReg()"); + + switch ((OperandEncoding)op->encoding) { + default: + debug("Expected a REG or R/M encoding in fixupReg"); + return -1; + case ENCODING_VVVV: + insn->vvvv = + (Reg)fixupRegValue(insn, (OperandType)op->type, insn->vvvv, &valid); + if (!valid) + return -1; + break; + case ENCODING_REG: + insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type, + insn->reg - insn->regBase, &valid); + if (!valid) + return -1; + break; + CASE_ENCODING_RM: + if (insn->eaBase >= insn->eaRegBase) { + insn->eaBase = (EABase)fixupRMValue( + insn, (OperandType)op->type, insn->eaBase - insn->eaRegBase, &valid); + if (!valid) + return -1; + } + break; + } + + return 0; +} + +// Read the opcode (except the ModR/M byte in the case of extended or escape +// opcodes). +static bool readOpcode(struct InternalInstruction *insn) { + uint8_t current; + LLVM_DEBUG(dbgs() << "readOpcode()"); + + insn->opcodeType = ONEBYTE; + if (insn->vectorExtensionType == TYPE_EVEX) { + switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { + default: + LLVM_DEBUG( + dbgs() << format("Unhandled mm field for instruction (0x%hhx)", + mmFromEVEX2of4(insn->vectorExtensionPrefix[1]))); + return true; + case VEX_LOB_0F: + insn->opcodeType = TWOBYTE; + return consume(insn, insn->opcode); + case VEX_LOB_0F38: + insn->opcodeType = THREEBYTE_38; + return consume(insn, insn->opcode); + case VEX_LOB_0F3A: + insn->opcodeType = THREEBYTE_3A; + return consume(insn, insn->opcode); + } + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { + default: + LLVM_DEBUG( + dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]))); + return true; + case VEX_LOB_0F: + insn->opcodeType = TWOBYTE; + return consume(insn, insn->opcode); + case VEX_LOB_0F38: + insn->opcodeType = THREEBYTE_38; + return consume(insn, insn->opcode); + case VEX_LOB_0F3A: + insn->opcodeType = THREEBYTE_3A; + return consume(insn, insn->opcode); + } + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { + insn->opcodeType = TWOBYTE; + return consume(insn, insn->opcode); + } else if (insn->vectorExtensionType == TYPE_XOP) { + switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) { + default: + LLVM_DEBUG( + dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)", + mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]))); + return true; + case XOP_MAP_SELECT_8: + insn->opcodeType = XOP8_MAP; + return consume(insn, insn->opcode); + case XOP_MAP_SELECT_9: + insn->opcodeType = XOP9_MAP; + return consume(insn, insn->opcode); + case XOP_MAP_SELECT_A: + insn->opcodeType = XOPA_MAP; + return consume(insn, insn->opcode); + } + } + + if (consume(insn, current)) + return true; + + if (current == 0x0f) { + LLVM_DEBUG( + dbgs() << format("Found a two-byte escape prefix (0x%hhx)", current)); + if (consume(insn, current)) + return true; + + if (current == 0x38) { + LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)", + current)); + if (consume(insn, current)) + return true; + + insn->opcodeType = THREEBYTE_38; + } else if (current == 0x3a) { + LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)", + current)); + if (consume(insn, current)) + return true; + + insn->opcodeType = THREEBYTE_3A; + } else if (current == 0x0f) { + LLVM_DEBUG( + dbgs() << format("Found a 3dnow escape prefix 
(0x%hhx)", current)); + + // Consume operands before the opcode to comply with the 3DNow encoding + if (readModRM(insn)) + return true; + + if (consume(insn, current)) + return true; + + insn->opcodeType = THREEDNOW_MAP; + } else { + LLVM_DEBUG(dbgs() << "Didn't find a three-byte escape prefix"); + insn->opcodeType = TWOBYTE; + } + } else if (insn->mandatoryPrefix) + // The opcode with mandatory prefix must start with opcode escape. + // If not it's legacy repeat prefix + insn->mandatoryPrefix = 0; + + // At this point we have consumed the full opcode. + // Anything we consume from here on must be unconsumed. + insn->opcode = current; + + return false; +} + +// Determine whether equiv is the 16-bit equivalent of orig (32-bit or 64-bit). +static bool is16BitEquivalent(const char *orig, const char *equiv) { + for (int i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return true; + if (orig[i] == '\0' || equiv[i] == '\0') + return false; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return false; + } + } +} + +// Determine whether this instruction is a 64-bit instruction. +static bool is64Bit(const char *name) { + for (int i = 0;; ++i) { + if (name[i] == '\0') + return false; + if (name[i] == '6' && name[i + 1] == '4') + return true; + } +} + +// Determine the ID of an instruction, consuming the ModR/M byte as appropriate +// for extended and escape opcodes, and using a supplied attribute mask. +static int getInstructionIDWithAttrMask(uint16_t *instructionID, + struct InternalInstruction *insn, + uint16_t attrMask) { + auto insnCtx = InstructionContext(x86DisassemblerContexts[attrMask]); + const ContextDecision *decision; + switch (insn->opcodeType) { + case ONEBYTE: + decision = &ONEBYTE_SYM; + break; + case TWOBYTE: + decision = &TWOBYTE_SYM; + break; + case THREEBYTE_38: + decision = &THREEBYTE38_SYM; + break; + case THREEBYTE_3A: + decision = &THREEBYTE3A_SYM; + break; + case XOP8_MAP: + decision = &XOP8_MAP_SYM; + break; + case XOP9_MAP: + decision = &XOP9_MAP_SYM; + break; + case XOPA_MAP: + decision = &XOPA_MAP_SYM; + break; + case THREEDNOW_MAP: + decision = &THREEDNOW_MAP_SYM; + break; + } + + if (decision->opcodeDecisions[insnCtx] + .modRMDecisions[insn->opcode] + .modrm_type != MODRM_ONEENTRY) { + if (readModRM(insn)) + return -1; + *instructionID = + decode(insn->opcodeType, insnCtx, insn->opcode, insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, insnCtx, insn->opcode, 0); + } + + return 0; +} + +// Determine the ID of an instruction, consuming the ModR/M byte as appropriate +// for extended and escape opcodes. Determines the attributes and context for +// the instruction before doing so. +static int getInstructionID(struct InternalInstruction *insn, + const MCInstrInfo *mii) { + uint16_t attrMask; + uint16_t instructionID; + + LLVM_DEBUG(dbgs() << "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; + + if (insn->vectorExtensionType == TYPE_EVEX) { + switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXKZ; + if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXB; + if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXK; + if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_VEXL; + if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + attrMask |= ATTR_EVEXL2; + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { + switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) + attrMask |= ATTR_VEXL; + } else if (insn->vectorExtensionType == TYPE_XOP) { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) + attrMask |= ATTR_VEXL; + } else { + return -1; + } + } else if (!insn->mandatoryPrefix) { + // If we don't have mandatory prefix we should use legacy prefixes here + if (insn->hasOpSize && (insn->mode != MODE_16BIT)) + attrMask |= ATTR_OPSIZE; + if (insn->hasAdSize) + attrMask |= ATTR_ADSIZE; + if (insn->opcodeType == ONEBYTE) { + if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90)) + // Special support for PAUSE + attrMask |= ATTR_XS; + } else { + if (insn->repeatPrefix == 0xf2) + attrMask |= ATTR_XD; + else if (insn->repeatPrefix == 0xf3) + attrMask |= ATTR_XS; + } + } else { + switch (insn->mandatoryPrefix) { + case 0xf2: + attrMask |= ATTR_XD; + break; + case 0xf3: + attrMask |= ATTR_XS; + break; + case 0x66: + if (insn->mode != MODE_16BIT) + attrMask |= ATTR_OPSIZE; + break; + case 0x67: + attrMask |= ATTR_ADSIZE; + break; + } + } + + if (insn->rexPrefix & 0x08) { + attrMask |= ATTR_REXW; + attrMask &= ~ATTR_ADSIZE; + } + + if (insn->mode == MODE_16BIT) { + // JCXZ/JECXZ need special handling for 16-bit mode because the meaning + // of the AdSize prefix is inverted w.r.t. 32-bit mode. + if (insn->opcodeType == ONEBYTE && insn->opcode == 0xE3) + attrMask ^= ATTR_ADSIZE; + // If we're in 16-bit mode and this is one of the relative jumps and opsize + // prefix isn't present, we need to force the opsize attribute since the + // prefix is inverted relative to 32-bit mode. 
+ if (!insn->hasOpSize && insn->opcodeType == ONEBYTE && + (insn->opcode == 0xE8 || insn->opcode == 0xE9)) + attrMask |= ATTR_OPSIZE; + + if (!insn->hasOpSize && insn->opcodeType == TWOBYTE && + insn->opcode >= 0x80 && insn->opcode <= 0x8F) + attrMask |= ATTR_OPSIZE; + } + + + if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + // The following clauses compensate for limitations of the tables. + + if (insn->mode != MODE_64BIT && + insn->vectorExtensionType != TYPE_NO_VEX_XOP) { + // The tables can't distinquish between cases where the W-bit is used to + // select register size and cases where its a required part of the opcode. + if ((insn->vectorExtensionType == TYPE_EVEX && + wFromEVEX3of4(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_VEX_3B && + wFromVEX3of3(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_XOP && + wFromXOP3of3(insn->vectorExtensionPrefix[2]))) { + + uint16_t instructionIDWithREXW; + if (getInstructionIDWithAttrMask(&instructionIDWithREXW, insn, + attrMask | ATTR_REXW)) { + insn->instructionID = instructionID; + insn->spec = &INSTRUCTIONS_SYM[instructionID]; + return 0; + } + + auto SpecName = mii->getName(instructionIDWithREXW); + // If not a 64-bit instruction. Switch the opcode. + if (!is64Bit(SpecName.data())) { + insn->instructionID = instructionIDWithREXW; + insn->spec = &INSTRUCTIONS_SYM[instructionIDWithREXW]; + return 0; + } + } + } + + // Absolute moves, umonitor, and movdir64b need special handling. + // -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are + // inverted w.r.t. + // -For 32-bit mode we need to ensure the ADSIZE prefix is observed in + // any position. + if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) || + (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) || + (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) { + // Make sure we observed the prefixes in any position. + if (insn->hasAdSize) + attrMask |= ATTR_ADSIZE; + if (insn->hasOpSize) + attrMask |= ATTR_OPSIZE; + + // In 16-bit, invert the attributes. + if (insn->mode == MODE_16BIT) { + attrMask ^= ATTR_ADSIZE; + + // The OpSize attribute is only valid with the absolute moves. + if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) + attrMask ^= ATTR_OPSIZE; + } + + if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + insn->instructionID = instructionID; + insn->spec = &INSTRUCTIONS_SYM[instructionID]; + return 0; + } + + if ((insn->mode == MODE_16BIT || insn->hasOpSize) && + !(attrMask & ATTR_OPSIZE)) { + // The instruction tables make no distinction between instructions that + // allow OpSize anywhere (i.e., 16-bit operations) and that need it in a + // particular spot (i.e., many MMX operations). In general we're + // conservative, but in the specific case where OpSize is present but not in + // the right place we check if there's a 16-bit operation. + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + llvm::StringRef specName, specWithOpSizeName; + + spec = &INSTRUCTIONS_SYM[instructionID]; + + if (getInstructionIDWithAttrMask(&instructionIDWithOpsize, insn, + attrMask | ATTR_OPSIZE)) { + // ModRM required with OpSize but not present. Give up and return the + // version without OpSize set. 
+ insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specName = mii->getName(instructionID); + specWithOpSizeName = mii->getName(instructionIDWithOpsize); + + if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) && + (insn->mode == MODE_16BIT) ^ insn->hasOpSize) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = &INSTRUCTIONS_SYM[instructionIDWithOpsize]; + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + // NOOP shouldn't decode as NOOP if REX.b is set. Instead it should decode + // as XCHG %r8, %eax. + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = &INSTRUCTIONS_SYM[instructionID]; + + // Borrow opcode from one of the other XCHGar opcodes + insn->opcode = 0x91; + + if (getInstructionIDWithAttrMask(&instructionIDWithNewOpcode, insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = &INSTRUCTIONS_SYM[instructionIDWithNewOpcode]; + + // Change back + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = &INSTRUCTIONS_SYM[insn->instructionID]; + + return 0; +} + +// Read an operand from the opcode field of an instruction and interprets it +// appropriately given the operand width. Handles AddRegFrm instructions. +// +// @param insn - the instruction whose opcode field is to be read. +// @param size - The width (in bytes) of the register being specified. +// 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means +// RAX. +// @return - 0 on success; nonzero otherwise. +static int readOpcodeRegister(struct InternalInstruction *insn, uint8_t size) { + LLVM_DEBUG(dbgs() << "readOpcodeRegister()"); + + if (size == 0) + size = insn->registerSize; + + switch (size) { + case 1: + insn->opcodeRegister = (Reg)( + MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7))); + if (insn->rexPrefix && insn->opcodeRegister >= MODRM_REG_AL + 0x4 && + insn->opcodeRegister < MODRM_REG_AL + 0x8) { + insn->opcodeRegister = + (Reg)(MODRM_REG_SPL + (insn->opcodeRegister - MODRM_REG_AL - 4)); + } + + break; + case 2: + insn->opcodeRegister = (Reg)( + MODRM_REG_AX + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7))); + break; + case 4: + insn->opcodeRegister = + (Reg)(MODRM_REG_EAX + + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7))); + break; + case 8: + insn->opcodeRegister = + (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7))); + break; + } + + return 0; +} + +// Consume an immediate operand from an instruction, given the desired operand +// size. +// +// @param insn - The instruction whose operand is to be read. +// @param size - The width (in bytes) of the operand. +// @return - 0 if the immediate was successfully consumed; nonzero +// otherwise. 
+static int readImmediate(struct InternalInstruction *insn, uint8_t size) { + uint8_t imm8; + uint16_t imm16; + uint32_t imm32; + uint64_t imm64; + + LLVM_DEBUG(dbgs() << "readImmediate()"); + + assert(insn->numImmediatesConsumed < 2 && "Already consumed two immediates"); + + insn->immediateSize = size; + insn->immediateOffset = insn->readerCursor - insn->startLocation; + + switch (size) { + case 1: + if (consume(insn, imm8)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm8; + break; + case 2: + if (consume(insn, imm16)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm16; + break; + case 4: + if (consume(insn, imm32)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm32; + break; + case 8: + if (consume(insn, imm64)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm64; + break; + default: + llvm_unreachable("invalid size"); + } + + insn->numImmediatesConsumed++; + + return 0; +} + +// Consume vvvv from an instruction if it has a VEX prefix. +static int readVVVV(struct InternalInstruction *insn) { + LLVM_DEBUG(dbgs() << "readVVVV()"); + + int vvvv; + if (insn->vectorExtensionType == TYPE_EVEX) + vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | + vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); + else if (insn->vectorExtensionType == TYPE_VEX_3B) + vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); + else if (insn->vectorExtensionType == TYPE_VEX_2B) + vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); + else if (insn->vectorExtensionType == TYPE_XOP) + vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); + else + return -1; + + if (insn->mode != MODE_64BIT) + vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later. + + insn->vvvv = static_cast<Reg>(vvvv); + return 0; +} + +// Read an mask register from the opcode field of an instruction. +// +// @param insn - The instruction whose opcode field is to be read. +// @return - 0 on success; nonzero otherwise. +static int readMaskRegister(struct InternalInstruction *insn) { + LLVM_DEBUG(dbgs() << "readMaskRegister()"); + + if (insn->vectorExtensionType != TYPE_EVEX) + return -1; + + insn->writemask = + static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])); + return 0; +} + +// Consults the specifier for an instruction and consumes all +// operands for that instruction, interpreting them as it goes. +static int readOperands(struct InternalInstruction *insn) { + int hasVVVV, needVVVV; + int sawRegImm = 0; + + LLVM_DEBUG(dbgs() << "readOperands()"); + + // If non-zero vvvv specified, make sure one of the operands uses it. + hasVVVV = !readVVVV(insn); + needVVVV = hasVVVV && (insn->vvvv != 0); + + for (const auto &Op : x86OperandSets[insn->spec->operands]) { + switch (Op.encoding) { + case ENCODING_NONE: + case ENCODING_SI: + case ENCODING_DI: + break; + CASE_ENCODING_VSIB: + // VSIB can use the V2 bit so check only the other bits. + if (needVVVV) + needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0); + if (readModRM(insn)) + return -1; + + // Reject if SIB wasn't used. + if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64) + return -1; + + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4. + if (insn->sibIndex == SIB_INDEX_NONE) + insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4); + + // If EVEX.v2 is set this is one of the 16-31 registers. 
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT && + v2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + insn->sibIndex = (SIBIndex)(insn->sibIndex + 16); + + // Adjust the index register to the correct size. + switch ((OperandType)Op.type) { + default: + debug("Unhandled VSIB index type"); + return -1; + case TYPE_MVSIBX: + insn->sibIndex = + (SIBIndex)(SIB_INDEX_XMM0 + (insn->sibIndex - insn->sibIndexBase)); + break; + case TYPE_MVSIBY: + insn->sibIndex = + (SIBIndex)(SIB_INDEX_YMM0 + (insn->sibIndex - insn->sibIndexBase)); + break; + case TYPE_MVSIBZ: + insn->sibIndex = + (SIBIndex)(SIB_INDEX_ZMM0 + (insn->sibIndex - insn->sibIndexBase)); + break; + } + + // Apply the AVX512 compressed displacement scaling factor. + if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB); + break; + case ENCODING_REG: + CASE_ENCODING_RM: + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + // Apply the AVX512 compressed displacement scaling factor. + if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_RM); + break; + case ENCODING_IB: + if (sawRegImm) { + // Saw a register immediate so don't read again and instead split the + // previous immediate. FIXME: This is a hack. + insn->immediates[insn->numImmediatesConsumed] = + insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; + ++insn->numImmediatesConsumed; + break; + } + if (readImmediate(insn, 1)) + return -1; + if (Op.type == TYPE_XMM || Op.type == TYPE_YMM) + sawRegImm = 1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_IRC: + insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) | + lFromEVEX4of4(insn->vectorExtensionPrefix[3]); + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_CC: + insn->immediates[1] = insn->opcode & 0xf; + break; + case ENCODING_FP: + break; + case ENCODING_VVVV: + needVVVV = 0; // Mark that we have found a VVVV operand. + if (!hasVVVV) + return -1; + if (insn->mode != MODE_64BIT) + insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7); + if (fixupReg(insn, &Op)) + return -1; + break; + case ENCODING_WRITEMASK: + if (readMaskRegister(insn)) + return -1; + break; + case ENCODING_DUP: + break; + default: + LLVM_DEBUG(dbgs() << "Encountered an operand with an unknown encoding."); + return -1; + } + } + + // If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail + if (needVVVV) + return -1; + + return 0; +} namespace llvm { -// Fill-ins to make the compiler happy. These constants are never actually -// assigned; they are just filler to make an automatically-generated switch -// statement work. +// Fill-ins to make the compiler happy. 
These constants are never actually +// assigned; they are just filler to make an automatically-generated switch +// statement work. namespace X86 { enum { BX_SI = 500, @@ -140,7 +1669,6 @@ public: public: DecodeStatus getInstruction(MCInst &instr, uint64_t &size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &vStream, raw_ostream &cStream) const override; private: @@ -169,91 +1697,51 @@ X86GenericDisassembler::X86GenericDisassembler( llvm_unreachable("Invalid CPU mode"); } -namespace { -struct Region { - ArrayRef<uint8_t> Bytes; - uint64_t Base; - Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} -}; -} // end anonymous namespace - -/// A callback function that wraps the readByte method from Region. -/// -/// @param Arg - The generic callback parameter. In this case, this should -/// be a pointer to a Region. -/// @param Byte - A pointer to the byte to be read. -/// @param Address - The address to be read. -static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { - auto *R = static_cast<const Region *>(Arg); - ArrayRef<uint8_t> Bytes = R->Bytes; - unsigned Index = Address - R->Base; - if (Bytes.size() <= Index) - return -1; - *Byte = Bytes[Index]; - return 0; -} - -/// logger - a callback function that wraps the operator<< method from -/// raw_ostream. -/// -/// @param arg - The generic callback parameter. This should be a pointe -/// to a raw_ostream. -/// @param log - A string to be logged. logger() adds a newline. -static void logger(void* arg, const char* log) { - if (!arg) - return; - - raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); - vStream << log << "\n"; -} - -// -// Public interface for the disassembler -// - MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, - raw_ostream &VStream, raw_ostream &CStream) const { + raw_ostream &CStream) const { CommentStream = &CStream; - InternalInstruction InternalInstr; - - dlog_t LoggerFn = logger; - if (&VStream == &nulls()) - LoggerFn = nullptr; // Disable logging completely if it's going to nulls(). 
- - Region R(Bytes, Address); - - int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R, - LoggerFn, (void *)&VStream, - (const void *)MII.get(), Address, fMode); - - if (Ret) { - Size = InternalInstr.readerCursor - Address; + InternalInstruction Insn; + memset(&Insn, 0, sizeof(InternalInstruction)); + Insn.bytes = Bytes; + Insn.startLocation = Address; + Insn.readerCursor = Address; + Insn.mode = fMode; + + if (Bytes.empty() || readPrefixes(&Insn) || readOpcode(&Insn) || + getInstructionID(&Insn, MII.get()) || Insn.instructionID == 0 || + readOperands(&Insn)) { + Size = Insn.readerCursor - Address; return Fail; - } else { - Size = InternalInstr.length; - bool Ret = translateInstruction(Instr, InternalInstr, this); - if (!Ret) { - unsigned Flags = X86::IP_NO_PREFIX; - if (InternalInstr.hasAdSize) - Flags |= X86::IP_HAS_AD_SIZE; - if (!InternalInstr.mandatoryPrefix) { - if (InternalInstr.hasOpSize) - Flags |= X86::IP_HAS_OP_SIZE; - if (InternalInstr.repeatPrefix == 0xf2) - Flags |= X86::IP_HAS_REPEAT_NE; - else if (InternalInstr.repeatPrefix == 0xf3 && - // It should not be 'pause' f3 90 - InternalInstr.opcode != 0x90) - Flags |= X86::IP_HAS_REPEAT; - if (InternalInstr.hasLockPrefix) - Flags |= X86::IP_HAS_LOCK; - } - Instr.setFlags(Flags); + } + + Insn.operands = x86OperandSets[Insn.spec->operands]; + Insn.length = Insn.readerCursor - Insn.startLocation; + Size = Insn.length; + if (Size > 15) + LLVM_DEBUG(dbgs() << "Instruction exceeds 15-byte limit"); + + bool Ret = translateInstruction(Instr, Insn, this); + if (!Ret) { + unsigned Flags = X86::IP_NO_PREFIX; + if (Insn.hasAdSize) + Flags |= X86::IP_HAS_AD_SIZE; + if (!Insn.mandatoryPrefix) { + if (Insn.hasOpSize) + Flags |= X86::IP_HAS_OP_SIZE; + if (Insn.repeatPrefix == 0xf2) + Flags |= X86::IP_HAS_REPEAT_NE; + else if (Insn.repeatPrefix == 0xf3 && + // It should not be 'pause' f3 90 + Insn.opcode != 0x90) + Flags |= X86::IP_HAS_REPEAT; + if (Insn.hasLockPrefix) + Flags |= X86::IP_HAS_LOCK; } - return (!Ret) ? Success : Fail; + Instr.setFlags(Flags); } + return (!Ret) ? Success : Fail; } // @@ -844,7 +2332,7 @@ static MCDisassembler *createX86Disassembler(const Target &T, return new X86GenericDisassembler(STI, Ctx, std::move(MII)); } -extern "C" void LLVMInitializeX86Disassembler() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Disassembler() { // Register the disassembler. TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(), createX86Disassembler); diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp deleted file mode 100644 index e287f6625115..000000000000 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ /dev/null @@ -1,1938 +0,0 @@ -//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is part of the X86 Disassembler. -// It contains the implementation of the instruction decoder. -// Documentation for the disassembler can be found in X86Disassembler.h. 
-// -//===----------------------------------------------------------------------===// - -#include "X86DisassemblerDecoder.h" -#include "llvm/ADT/StringRef.h" - -#include <cstdarg> /* for va_*() */ -#include <cstdio> /* for vsnprintf() */ -#include <cstdlib> /* for exit() */ -#include <cstring> /* for memset() */ - -using namespace llvm::X86Disassembler; - -/// Specifies whether a ModR/M byte is needed and (if so) which -/// instruction each possible value of the ModR/M byte corresponds to. Once -/// this information is known, we have narrowed down to a single instruction. -struct ModRMDecision { - uint8_t modrm_type; - uint16_t instructionIDs; -}; - -/// Specifies which set of ModR/M->instruction tables to look at -/// given a particular opcode. -struct OpcodeDecision { - ModRMDecision modRMDecisions[256]; -}; - -/// Specifies which opcode->instruction tables to look at given -/// a particular context (set of attributes). Since there are many possible -/// contexts, the decoder first uses CONTEXTS_SYM to determine which context -/// applies given a specific set of attributes. Hence there are only IC_max -/// entries in this table, rather than 2^(ATTR_max). -struct ContextDecision { - OpcodeDecision opcodeDecisions[IC_max]; -}; - -#include "X86GenDisassemblerTables.inc" - -#ifndef NDEBUG -#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0) -#else -#define debug(s) do { } while (0) -#endif - -/* - * contextForAttrs - Client for the instruction context table. Takes a set of - * attributes and returns the appropriate decode context. - * - * @param attrMask - Attributes, from the enumeration attributeBits. - * @return - The InstructionContext to use when looking up an - * an instruction with these attributes. - */ -static InstructionContext contextForAttrs(uint16_t attrMask) { - return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]); -} - -/* - * modRMRequired - Reads the appropriate instruction table to determine whether - * the ModR/M byte is required to decode a particular instruction. - * - * @param type - The opcode type (i.e., how many bytes it has). - * @param insnContext - The context for the instruction, as returned by - * contextForAttrs. - * @param opcode - The last byte of the instruction's opcode, not counting - * ModR/M extensions and escapes. - * @return - true if the ModR/M byte is required, false otherwise. - */ -static int modRMRequired(OpcodeType type, - InstructionContext insnContext, - uint16_t opcode) { - const struct ContextDecision* decision = nullptr; - - switch (type) { - case ONEBYTE: - decision = &ONEBYTE_SYM; - break; - case TWOBYTE: - decision = &TWOBYTE_SYM; - break; - case THREEBYTE_38: - decision = &THREEBYTE38_SYM; - break; - case THREEBYTE_3A: - decision = &THREEBYTE3A_SYM; - break; - case XOP8_MAP: - decision = &XOP8_MAP_SYM; - break; - case XOP9_MAP: - decision = &XOP9_MAP_SYM; - break; - case XOPA_MAP: - decision = &XOPA_MAP_SYM; - break; - case THREEDNOW_MAP: - decision = &THREEDNOW_MAP_SYM; - break; - } - - return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. - modrm_type != MODRM_ONEENTRY; -} - -/* - * decode - Reads the appropriate instruction table to obtain the unique ID of - * an instruction. - * - * @param type - See modRMRequired(). - * @param insnContext - See modRMRequired(). - * @param opcode - See modRMRequired(). - * @param modRM - The ModR/M byte if required, or any value if not. - * @return - The UID of the instruction, or 0 on failure. 
- */ -static InstrUID decode(OpcodeType type, - InstructionContext insnContext, - uint8_t opcode, - uint8_t modRM) { - const struct ModRMDecision* dec = nullptr; - - switch (type) { - case ONEBYTE: - dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case TWOBYTE: - dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case THREEBYTE_38: - dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case THREEBYTE_3A: - dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOP8_MAP: - dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOP9_MAP: - dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case XOPA_MAP: - dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - case THREEDNOW_MAP: - dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; - break; - } - - switch (dec->modrm_type) { - default: - debug("Corrupt table! Unknown modrm_type"); - return 0; - case MODRM_ONEENTRY: - return modRMTable[dec->instructionIDs]; - case MODRM_SPLITRM: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+1]; - return modRMTable[dec->instructionIDs]; - case MODRM_SPLITREG: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8]; - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; - case MODRM_SPLITMISC: - if (modFromModRM(modRM) == 0x3) - return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8]; - return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)]; - case MODRM_FULL: - return modRMTable[dec->instructionIDs+modRM]; - } -} - -/* - * specifierForUID - Given a UID, returns the name and operand specification for - * that instruction. - * - * @param uid - The unique ID for the instruction. This should be returned by - * decode(); specifierForUID will not check bounds. - * @return - A pointer to the specification for that instruction. - */ -static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { - return &INSTRUCTIONS_SYM[uid]; -} - -/* - * consumeByte - Uses the reader function provided by the user to consume one - * byte from the instruction's memory and advance the cursor. - * - * @param insn - The instruction with the reader function to use. The cursor - * for this instruction is advanced. - * @param byte - A pointer to a pre-allocated memory buffer to be populated - * with the data read. - * @return - 0 if the read was successful; nonzero otherwise. - */ -static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { - int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); - - if (!ret) - ++(insn->readerCursor); - - return ret; -} - -/* - * lookAtByte - Like consumeByte, but does not advance the cursor. - * - * @param insn - See consumeByte(). - * @param byte - See consumeByte(). - * @return - See consumeByte(). 
- */ -static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { - return insn->reader(insn->readerArg, byte, insn->readerCursor); -} - -static void unconsumeByte(struct InternalInstruction* insn) { - insn->readerCursor--; -} - -#define CONSUME_FUNC(name, type) \ - static int name(struct InternalInstruction* insn, type* ptr) { \ - type combined = 0; \ - unsigned offset; \ - for (offset = 0; offset < sizeof(type); ++offset) { \ - uint8_t byte; \ - int ret = insn->reader(insn->readerArg, \ - &byte, \ - insn->readerCursor + offset); \ - if (ret) \ - return ret; \ - combined = combined | ((uint64_t)byte << (offset * 8)); \ - } \ - *ptr = combined; \ - insn->readerCursor += sizeof(type); \ - return 0; \ - } - -/* - * consume* - Use the reader function provided by the user to consume data - * values of various sizes from the instruction's memory and advance the - * cursor appropriately. These readers perform endian conversion. - * - * @param insn - See consumeByte(). - * @param ptr - A pointer to a pre-allocated memory of appropriate size to - * be populated with the data read. - * @return - See consumeByte(). - */ -CONSUME_FUNC(consumeInt8, int8_t) -CONSUME_FUNC(consumeInt16, int16_t) -CONSUME_FUNC(consumeInt32, int32_t) -CONSUME_FUNC(consumeUInt16, uint16_t) -CONSUME_FUNC(consumeUInt32, uint32_t) -CONSUME_FUNC(consumeUInt64, uint64_t) - -/* - * dbgprintf - Uses the logging function provided by the user to log a single - * message, typically without a carriage-return. - * - * @param insn - The instruction containing the logging function. - * @param format - See printf(). - * @param ... - See printf(). - */ -static void dbgprintf(struct InternalInstruction* insn, - const char* format, - ...) { - char buffer[256]; - va_list ap; - - if (!insn->dlog) - return; - - va_start(ap, format); - (void)vsnprintf(buffer, sizeof(buffer), format, ap); - va_end(ap); - - insn->dlog(insn->dlogArg, buffer); -} - -static bool isREX(struct InternalInstruction *insn, uint8_t prefix) { - if (insn->mode == MODE_64BIT) - return prefix >= 0x40 && prefix <= 0x4f; - return false; -} - -/* - * setPrefixPresent - Marks that a particular prefix is present as mandatory - * - * @param insn - The instruction to be marked as having the prefix. - * @param prefix - The prefix that is present. - */ -static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) { - uint8_t nextByte; - switch (prefix) { - case 0xf0: - insn->hasLockPrefix = true; - break; - case 0xf2: - case 0xf3: - if (lookAtByte(insn, &nextByte)) - break; - // TODO: - // 1. There could be several 0x66 - // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then - // it's not mandatory prefix - // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need - // 0x0f exactly after it to be mandatory prefix - if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66) - // The last of 0xf2 /0xf3 is mandatory prefix - insn->mandatoryPrefix = prefix; - insn->repeatPrefix = prefix; - break; - case 0x66: - if (lookAtByte(insn, &nextByte)) - break; - // 0x66 can't overwrite existing mandatory prefix and should be ignored - if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte))) - insn->mandatoryPrefix = prefix; - break; - } -} - -/* - * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the - * instruction as having them. Also sets the instruction's default operand, - * address, and other relevant data sizes to report operands correctly. 
- * - * @param insn - The instruction whose prefixes are to be read. - * @return - 0 if the instruction could be read until the end of the prefix - * bytes, and no prefixes conflicted; nonzero otherwise. - */ -static int readPrefixes(struct InternalInstruction* insn) { - bool isPrefix = true; - uint8_t byte = 0; - uint8_t nextByte; - - dbgprintf(insn, "readPrefixes()"); - - while (isPrefix) { - /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ - if (consumeByte(insn, &byte)) - break; - - /* - * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then - * break and let it be disassembled as a normal "instruction". - */ - if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK - break; - - if ((byte == 0xf2 || byte == 0xf3) && !lookAtByte(insn, &nextByte)) { - /* - * If the byte is 0xf2 or 0xf3, and any of the following conditions are - * met: - * - it is followed by a LOCK (0xf0) prefix - * - it is followed by an xchg instruction - * then it should be disassembled as a xacquire/xrelease not repne/rep. - */ - if (((nextByte == 0xf0) || - ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) { - insn->xAcquireRelease = true; - if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support - break; - } - /* - * Also if the byte is 0xf3, and the following condition is met: - * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or - * "mov mem, imm" (opcode 0xc6/0xc7) instructions. - * then it should be disassembled as an xrelease not rep. - */ - if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 || - nextByte == 0xc6 || nextByte == 0xc7)) { - insn->xAcquireRelease = true; - break; - } - if (isREX(insn, nextByte)) { - uint8_t nnextByte; - // Go to REX prefix after the current one - if (consumeByte(insn, &nnextByte)) - return -1; - // We should be able to read next byte after REX prefix - if (lookAtByte(insn, &nnextByte)) - return -1; - unconsumeByte(insn); - } - } - - switch (byte) { - case 0xf0: /* LOCK */ - case 0xf2: /* REPNE/REPNZ */ - case 0xf3: /* REP or REPE/REPZ */ - setPrefixPresent(insn, byte); - break; - case 0x2e: /* CS segment override -OR- Branch not taken */ - case 0x36: /* SS segment override -OR- Branch taken */ - case 0x3e: /* DS segment override */ - case 0x26: /* ES segment override */ - case 0x64: /* FS segment override */ - case 0x65: /* GS segment override */ - switch (byte) { - case 0x2e: - insn->segmentOverride = SEG_OVERRIDE_CS; - break; - case 0x36: - insn->segmentOverride = SEG_OVERRIDE_SS; - break; - case 0x3e: - insn->segmentOverride = SEG_OVERRIDE_DS; - break; - case 0x26: - insn->segmentOverride = SEG_OVERRIDE_ES; - break; - case 0x64: - insn->segmentOverride = SEG_OVERRIDE_FS; - break; - case 0x65: - insn->segmentOverride = SEG_OVERRIDE_GS; - break; - default: - debug("Unhandled override"); - return -1; - } - setPrefixPresent(insn, byte); - break; - case 0x66: /* Operand-size override */ - insn->hasOpSize = true; - setPrefixPresent(insn, byte); - break; - case 0x67: /* Address-size override */ - insn->hasAdSize = true; - setPrefixPresent(insn, byte); - break; - default: /* Not a prefix byte */ - isPrefix = false; - break; - } - - if (isPrefix) - dbgprintf(insn, "Found prefix 0x%hhx", byte); - } - - insn->vectorExtensionType = TYPE_NO_VEX_XOP; - - if (byte == 0x62) { - uint8_t byte1, byte2; - - if (consumeByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of EVEX prefix"); - return -1; - } - - if (lookAtByte(insn, &byte2)) { - dbgprintf(insn, 
"Couldn't read third byte of EVEX prefix"); - return -1; - } - - if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && - ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { - insn->vectorExtensionType = TYPE_EVEX; - } else { - unconsumeByte(insn); /* unconsume byte1 */ - unconsumeByte(insn); /* unconsume byte */ - } - - if (insn->vectorExtensionType == TYPE_EVEX) { - insn->vectorExtensionPrefix[0] = byte; - insn->vectorExtensionPrefix[1] = byte1; - if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) { - dbgprintf(insn, "Couldn't read third byte of EVEX prefix"); - return -1; - } - if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) { - dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix"); - return -1; - } - - /* We simulate the REX prefix for simplicity's sake */ - if (insn->mode == MODE_64BIT) { - insn->rexPrefix = 0x40 - | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) - | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) - | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) - | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0); - } - - dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); - } - } else if (byte == 0xc4) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of VEX"); - return -1; - } - - if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) - insn->vectorExtensionType = TYPE_VEX_3B; - else - unconsumeByte(insn); - - if (insn->vectorExtensionType == TYPE_VEX_3B) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - consumeByte(insn, &insn->vectorExtensionPrefix[2]); - - /* We simulate the REX prefix for simplicity's sake */ - - if (insn->mode == MODE_64BIT) - insn->rexPrefix = 0x40 - | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) - | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) - | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) - | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); - - dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2]); - } - } else if (byte == 0xc5) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of VEX"); - return -1; - } - - if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) - insn->vectorExtensionType = TYPE_VEX_2B; - else - unconsumeByte(insn); - - if (insn->vectorExtensionType == TYPE_VEX_2B) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - - if (insn->mode == MODE_64BIT) - insn->rexPrefix = 0x40 - | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); - - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { - default: - break; - case VEX_PREFIX_66: - insn->hasOpSize = true; - break; - } - - dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], - insn->vectorExtensionPrefix[1]); - } - } else if (byte == 0x8f) { - uint8_t byte1; - - if (lookAtByte(insn, &byte1)) { - dbgprintf(insn, "Couldn't read second byte of XOP"); - return -1; - } - - if ((byte1 & 0x38) != 0x0) /* 0 in these 3 bits is a POP instruction. 
*/ - insn->vectorExtensionType = TYPE_XOP; - else - unconsumeByte(insn); - - if (insn->vectorExtensionType == TYPE_XOP) { - insn->vectorExtensionPrefix[0] = byte; - consumeByte(insn, &insn->vectorExtensionPrefix[1]); - consumeByte(insn, &insn->vectorExtensionPrefix[2]); - - /* We simulate the REX prefix for simplicity's sake */ - - if (insn->mode == MODE_64BIT) - insn->rexPrefix = 0x40 - | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) - | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) - | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) - | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); - - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { - default: - break; - case VEX_PREFIX_66: - insn->hasOpSize = true; - break; - } - - dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx", - insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], - insn->vectorExtensionPrefix[2]); - } - } else if (isREX(insn, byte)) { - if (lookAtByte(insn, &nextByte)) - return -1; - insn->rexPrefix = byte; - dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else - unconsumeByte(insn); - - if (insn->mode == MODE_16BIT) { - insn->registerSize = (insn->hasOpSize ? 4 : 2); - insn->addressSize = (insn->hasAdSize ? 4 : 2); - insn->displacementSize = (insn->hasAdSize ? 4 : 2); - insn->immediateSize = (insn->hasOpSize ? 4 : 2); - } else if (insn->mode == MODE_32BIT) { - insn->registerSize = (insn->hasOpSize ? 2 : 4); - insn->addressSize = (insn->hasAdSize ? 2 : 4); - insn->displacementSize = (insn->hasAdSize ? 2 : 4); - insn->immediateSize = (insn->hasOpSize ? 2 : 4); - } else if (insn->mode == MODE_64BIT) { - if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { - insn->registerSize = 8; - insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = 4; - insn->immediateSize = 4; - } else { - insn->registerSize = (insn->hasOpSize ? 2 : 4); - insn->addressSize = (insn->hasAdSize ? 4 : 8); - insn->displacementSize = (insn->hasOpSize ? 2 : 4); - insn->immediateSize = (insn->hasOpSize ? 2 : 4); - } - } - - return 0; -} - -static int readModRM(struct InternalInstruction* insn); - -/* - * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of - * extended or escape opcodes). - * - * @param insn - The instruction whose opcode is to be read. - * @return - 0 if the opcode could be read successfully; nonzero otherwise. 
- */
-static int readOpcode(struct InternalInstruction* insn) {
-  /* Determine the length of the primary opcode */
-
-  uint8_t current;
-
-  dbgprintf(insn, "readOpcode()");
-
-  insn->opcodeType = ONEBYTE;
-
-  if (insn->vectorExtensionType == TYPE_EVEX) {
-    switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
-    default:
-      dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
-                mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
-      return -1;
-    case VEX_LOB_0F:
-      insn->opcodeType = TWOBYTE;
-      return consumeByte(insn, &insn->opcode);
-    case VEX_LOB_0F38:
-      insn->opcodeType = THREEBYTE_38;
-      return consumeByte(insn, &insn->opcode);
-    case VEX_LOB_0F3A:
-      insn->opcodeType = THREEBYTE_3A;
-      return consumeByte(insn, &insn->opcode);
-    }
-  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
-    switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
-    default:
-      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
-                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
-      return -1;
-    case VEX_LOB_0F:
-      insn->opcodeType = TWOBYTE;
-      return consumeByte(insn, &insn->opcode);
-    case VEX_LOB_0F38:
-      insn->opcodeType = THREEBYTE_38;
-      return consumeByte(insn, &insn->opcode);
-    case VEX_LOB_0F3A:
-      insn->opcodeType = THREEBYTE_3A;
-      return consumeByte(insn, &insn->opcode);
-    }
-  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
-    insn->opcodeType = TWOBYTE;
-    return consumeByte(insn, &insn->opcode);
-  } else if (insn->vectorExtensionType == TYPE_XOP) {
-    switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
-    default:
-      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
-                mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
-      return -1;
-    case XOP_MAP_SELECT_8:
-      insn->opcodeType = XOP8_MAP;
-      return consumeByte(insn, &insn->opcode);
-    case XOP_MAP_SELECT_9:
-      insn->opcodeType = XOP9_MAP;
-      return consumeByte(insn, &insn->opcode);
-    case XOP_MAP_SELECT_A:
-      insn->opcodeType = XOPA_MAP;
-      return consumeByte(insn, &insn->opcode);
-    }
-  }
-
-  if (consumeByte(insn, &current))
-    return -1;
-
-  if (current == 0x0f) {
-    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
-
-    if (consumeByte(insn, &current))
-      return -1;
-
-    if (current == 0x38) {
-      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
-      if (consumeByte(insn, &current))
-        return -1;
-
-      insn->opcodeType = THREEBYTE_38;
-    } else if (current == 0x3a) {
-      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
-      if (consumeByte(insn, &current))
-        return -1;
-
-      insn->opcodeType = THREEBYTE_3A;
-    } else if (current == 0x0f) {
-      dbgprintf(insn, "Found a 3dnow escape prefix (0x%hhx)", current);
-
-      // Consume operands before the opcode to comply with the 3DNow encoding
-      if (readModRM(insn))
-        return -1;
-
-      if (consumeByte(insn, &current))
-        return -1;
-
-      insn->opcodeType = THREEDNOW_MAP;
-    } else {
-      dbgprintf(insn, "Didn't find a three-byte escape prefix");
-
-      insn->opcodeType = TWOBYTE;
-    }
-  } else if (insn->mandatoryPrefix)
-    // The opcode with mandatory prefix must start with opcode escape.
-    // If not it's legacy repeat prefix
-    insn->mandatoryPrefix = 0;
-
-  /*
-   * At this point we have consumed the full opcode.
-   * Anything we consume from here on must be unconsumed.
-   */
-
-  insn->opcode = current;
-
-  return 0;
-}
-
-/*
- * getIDWithAttrMask - Determines the ID of an instruction, consuming
- * the ModR/M byte as appropriate for extended and escape opcodes,
- * and using a supplied attribute mask.
- * - * @param instructionID - A pointer whose target is filled in with the ID of the - * instruction. - * @param insn - The instruction whose ID is to be determined. - * @param attrMask - The attribute mask to search. - * @return - 0 if the ModR/M could be read when needed or was not - * needed; nonzero otherwise. - */ -static int getIDWithAttrMask(uint16_t* instructionID, - struct InternalInstruction* insn, - uint16_t attrMask) { - bool hasModRMExtension; - - InstructionContext instructionClass = contextForAttrs(attrMask); - - hasModRMExtension = modRMRequired(insn->opcodeType, - instructionClass, - insn->opcode); - - if (hasModRMExtension) { - if (readModRM(insn)) - return -1; - - *instructionID = decode(insn->opcodeType, - instructionClass, - insn->opcode, - insn->modRM); - } else { - *instructionID = decode(insn->opcodeType, - instructionClass, - insn->opcode, - 0); - } - - return 0; -} - -/* - * is16BitEquivalent - Determines whether two instruction names refer to - * equivalent instructions but one is 16-bit whereas the other is not. - * - * @param orig - The instruction that is not 16-bit - * @param equiv - The instruction that is 16-bit - */ -static bool is16BitEquivalent(const char *orig, const char *equiv) { - off_t i; - - for (i = 0;; i++) { - if (orig[i] == '\0' && equiv[i] == '\0') - return true; - if (orig[i] == '\0' || equiv[i] == '\0') - return false; - if (orig[i] != equiv[i]) { - if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') - continue; - if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') - continue; - if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') - continue; - return false; - } - } -} - -/* - * is64Bit - Determines whether this instruction is a 64-bit instruction. - * - * @param name - The instruction that is not 16-bit - */ -static bool is64Bit(const char *name) { - off_t i; - - for (i = 0;; ++i) { - if (name[i] == '\0') - return false; - if (name[i] == '6' && name[i+1] == '4') - return true; - } -} - -/* - * getID - Determines the ID of an instruction, consuming the ModR/M byte as - * appropriate for extended and escape opcodes. Determines the attributes and - * context for the instruction before doing so. - * - * @param insn - The instruction whose ID is to be determined. - * @return - 0 if the ModR/M could be read when needed or was not needed; - * nonzero otherwise. - */ -static int getID(struct InternalInstruction* insn, const void *miiArg) { - uint16_t attrMask; - uint16_t instructionID; - - dbgprintf(insn, "getID()"); - - attrMask = ATTR_NONE; - - if (insn->mode == MODE_64BIT) - attrMask |= ATTR_64BIT; - - if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) { - attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? 
ATTR_EVEX : ATTR_VEX; - - if (insn->vectorExtensionType == TYPE_EVEX) { - switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (zFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXKZ; - if (bFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXB; - if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXK; - if (lFromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_VEXL; - if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) - attrMask |= ATTR_EVEXL2; - } else if (insn->vectorExtensionType == TYPE_VEX_3B) { - switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) - attrMask |= ATTR_VEXL; - } else if (insn->vectorExtensionType == TYPE_VEX_2B) { - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) - attrMask |= ATTR_VEXL; - } else if (insn->vectorExtensionType == TYPE_XOP) { - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { - case VEX_PREFIX_66: - attrMask |= ATTR_OPSIZE; - break; - case VEX_PREFIX_F3: - attrMask |= ATTR_XS; - break; - case VEX_PREFIX_F2: - attrMask |= ATTR_XD; - break; - } - - if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) - attrMask |= ATTR_VEXL; - } else { - return -1; - } - } else if (!insn->mandatoryPrefix) { - // If we don't have mandatory prefix we should use legacy prefixes here - if (insn->hasOpSize && (insn->mode != MODE_16BIT)) - attrMask |= ATTR_OPSIZE; - if (insn->hasAdSize) - attrMask |= ATTR_ADSIZE; - if (insn->opcodeType == ONEBYTE) { - if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90)) - // Special support for PAUSE - attrMask |= ATTR_XS; - } else { - if (insn->repeatPrefix == 0xf2) - attrMask |= ATTR_XD; - else if (insn->repeatPrefix == 0xf3) - attrMask |= ATTR_XS; - } - } else { - switch (insn->mandatoryPrefix) { - case 0xf2: - attrMask |= ATTR_XD; - break; - case 0xf3: - attrMask |= ATTR_XS; - break; - case 0x66: - if (insn->mode != MODE_16BIT) - attrMask |= ATTR_OPSIZE; - break; - case 0x67: - attrMask |= ATTR_ADSIZE; - break; - } - - } - - if (insn->rexPrefix & 0x08) { - attrMask |= ATTR_REXW; - attrMask &= ~ATTR_ADSIZE; - } - - /* - * JCXZ/JECXZ need special handling for 16-bit mode because the meaning - * of the AdSize prefix is inverted w.r.t. 32-bit mode. - */ - if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE && - insn->opcode == 0xE3) - attrMask ^= ATTR_ADSIZE; - - // If we're in 16-bit mode and this is one of the relative jumps and opsize - // prefix isn't present, we need to force the opsize attribute since the - // prefix is inverted relative to 32-bit mode. 
- if (insn->mode == MODE_16BIT && !insn->hasOpSize && - insn->opcodeType == ONEBYTE && - (insn->opcode == 0xE8 || insn->opcode == 0xE9)) - attrMask |= ATTR_OPSIZE; - - if (insn->mode == MODE_16BIT && !insn->hasOpSize && - insn->opcodeType == TWOBYTE && - insn->opcode >= 0x80 && insn->opcode <= 0x8F) - attrMask |= ATTR_OPSIZE; - - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - - /* The following clauses compensate for limitations of the tables. */ - - if (insn->mode != MODE_64BIT && - insn->vectorExtensionType != TYPE_NO_VEX_XOP) { - /* - * The tables can't distinquish between cases where the W-bit is used to - * select register size and cases where its a required part of the opcode. - */ - if ((insn->vectorExtensionType == TYPE_EVEX && - wFromEVEX3of4(insn->vectorExtensionPrefix[2])) || - (insn->vectorExtensionType == TYPE_VEX_3B && - wFromVEX3of3(insn->vectorExtensionPrefix[2])) || - (insn->vectorExtensionType == TYPE_XOP && - wFromXOP3of3(insn->vectorExtensionPrefix[2]))) { - - uint16_t instructionIDWithREXW; - if (getIDWithAttrMask(&instructionIDWithREXW, - insn, attrMask | ATTR_REXW)) { - insn->instructionID = instructionID; - insn->spec = specifierForUID(instructionID); - return 0; - } - - auto SpecName = GetInstrName(instructionIDWithREXW, miiArg); - // If not a 64-bit instruction. Switch the opcode. - if (!is64Bit(SpecName.data())) { - insn->instructionID = instructionIDWithREXW; - insn->spec = specifierForUID(instructionIDWithREXW); - return 0; - } - } - } - - /* - * Absolute moves, umonitor, and movdir64b need special handling. - * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are - * inverted w.r.t. - * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in - * any position. - */ - if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) || - (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) || - (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) { - /* Make sure we observed the prefixes in any position. */ - if (insn->hasAdSize) - attrMask |= ATTR_ADSIZE; - if (insn->hasOpSize) - attrMask |= ATTR_OPSIZE; - - /* In 16-bit, invert the attributes. */ - if (insn->mode == MODE_16BIT) { - attrMask ^= ATTR_ADSIZE; - - /* The OpSize attribute is only valid with the absolute moves. */ - if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) - attrMask ^= ATTR_OPSIZE; - } - - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - - insn->instructionID = instructionID; - insn->spec = specifierForUID(instructionID); - return 0; - } - - if ((insn->mode == MODE_16BIT || insn->hasOpSize) && - !(attrMask & ATTR_OPSIZE)) { - /* - * The instruction tables make no distinction between instructions that - * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a - * particular spot (i.e., many MMX operations). In general we're - * conservative, but in the specific case where OpSize is present but not - * in the right place we check if there's a 16-bit operation. 
- */ - - const struct InstructionSpecifier *spec; - uint16_t instructionIDWithOpsize; - llvm::StringRef specName, specWithOpSizeName; - - spec = specifierForUID(instructionID); - - if (getIDWithAttrMask(&instructionIDWithOpsize, - insn, - attrMask | ATTR_OPSIZE)) { - /* - * ModRM required with OpSize but not present; give up and return version - * without OpSize set - */ - - insn->instructionID = instructionID; - insn->spec = spec; - return 0; - } - - specName = GetInstrName(instructionID, miiArg); - specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); - - if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) && - (insn->mode == MODE_16BIT) ^ insn->hasOpSize) { - insn->instructionID = instructionIDWithOpsize; - insn->spec = specifierForUID(instructionIDWithOpsize); - } else { - insn->instructionID = instructionID; - insn->spec = spec; - } - return 0; - } - - if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && - insn->rexPrefix & 0x01) { - /* - * NOOP shouldn't decode as NOOP if REX.b is set. Instead - * it should decode as XCHG %r8, %eax. - */ - - const struct InstructionSpecifier *spec; - uint16_t instructionIDWithNewOpcode; - const struct InstructionSpecifier *specWithNewOpcode; - - spec = specifierForUID(instructionID); - - /* Borrow opcode from one of the other XCHGar opcodes */ - insn->opcode = 0x91; - - if (getIDWithAttrMask(&instructionIDWithNewOpcode, - insn, - attrMask)) { - insn->opcode = 0x90; - - insn->instructionID = instructionID; - insn->spec = spec; - return 0; - } - - specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); - - /* Change back */ - insn->opcode = 0x90; - - insn->instructionID = instructionIDWithNewOpcode; - insn->spec = specWithNewOpcode; - - return 0; - } - - insn->instructionID = instructionID; - insn->spec = specifierForUID(insn->instructionID); - - return 0; -} - -/* - * readSIB - Consumes the SIB byte to determine addressing information for an - * instruction. - * - * @param insn - The instruction whose SIB byte is to be read. - * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
- */ -static int readSIB(struct InternalInstruction* insn) { - SIBBase sibBaseBase = SIB_BASE_NONE; - uint8_t index, base; - - dbgprintf(insn, "readSIB()"); - - if (insn->consumedSIB) - return 0; - - insn->consumedSIB = true; - - switch (insn->addressSize) { - case 2: - dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); - return -1; - case 4: - insn->sibIndexBase = SIB_INDEX_EAX; - sibBaseBase = SIB_BASE_EAX; - break; - case 8: - insn->sibIndexBase = SIB_INDEX_RAX; - sibBaseBase = SIB_BASE_RAX; - break; - } - - if (consumeByte(insn, &insn->sib)) - return -1; - - index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); - - if (index == 0x4) { - insn->sibIndex = SIB_INDEX_NONE; - } else { - insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index); - } - - insn->sibScale = 1 << scaleFromSIB(insn->sib); - - base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); - - switch (base) { - case 0x5: - case 0xd: - switch (modFromModRM(insn->modRM)) { - case 0x0: - insn->eaDisplacement = EA_DISP_32; - insn->sibBase = SIB_BASE_NONE; - break; - case 0x1: - insn->eaDisplacement = EA_DISP_8; - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - case 0x2: - insn->eaDisplacement = EA_DISP_32; - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - case 0x3: - debug("Cannot have Mod = 0b11 and a SIB byte"); - return -1; - } - break; - default: - insn->sibBase = (SIBBase)(sibBaseBase + base); - break; - } - - return 0; -} - -/* - * readDisplacement - Consumes the displacement of an instruction. - * - * @param insn - The instruction whose displacement is to be read. - * @return - 0 if the displacement byte was successfully read; nonzero - * otherwise. - */ -static int readDisplacement(struct InternalInstruction* insn) { - int8_t d8; - int16_t d16; - int32_t d32; - - dbgprintf(insn, "readDisplacement()"); - - if (insn->consumedDisplacement) - return 0; - - insn->consumedDisplacement = true; - insn->displacementOffset = insn->readerCursor - insn->startLocation; - - switch (insn->eaDisplacement) { - case EA_DISP_NONE: - insn->consumedDisplacement = false; - break; - case EA_DISP_8: - if (consumeInt8(insn, &d8)) - return -1; - insn->displacement = d8; - break; - case EA_DISP_16: - if (consumeInt16(insn, &d16)) - return -1; - insn->displacement = d16; - break; - case EA_DISP_32: - if (consumeInt32(insn, &d32)) - return -1; - insn->displacement = d32; - break; - } - - insn->consumedDisplacement = true; - return 0; -} - -/* - * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and - * displacement) for an instruction and interprets it. - * - * @param insn - The instruction whose addressing information is to be read. - * @return - 0 if the information was successfully read; nonzero otherwise. - */ -static int readModRM(struct InternalInstruction* insn) { - uint8_t mod, rm, reg, evexrm; - - dbgprintf(insn, "readModRM()"); - - if (insn->consumedModRM) - return 0; - - if (consumeByte(insn, &insn->modRM)) - return -1; - insn->consumedModRM = true; - - mod = modFromModRM(insn->modRM); - rm = rmFromModRM(insn->modRM); - reg = regFromModRM(insn->modRM); - - /* - * This goes by insn->registerSize to pick the correct register, which messes - * up if we're using (say) XMM or 8-bit register operands. That gets fixed in - * fixupReg(). 
- */ - switch (insn->registerSize) { - case 2: - insn->regBase = MODRM_REG_AX; - insn->eaRegBase = EA_REG_AX; - break; - case 4: - insn->regBase = MODRM_REG_EAX; - insn->eaRegBase = EA_REG_EAX; - break; - case 8: - insn->regBase = MODRM_REG_RAX; - insn->eaRegBase = EA_REG_RAX; - break; - } - - reg |= rFromREX(insn->rexPrefix) << 3; - rm |= bFromREX(insn->rexPrefix) << 3; - - evexrm = 0; - if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) { - reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; - evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4; - } - - insn->reg = (Reg)(insn->regBase + reg); - - switch (insn->addressSize) { - case 2: { - EABase eaBaseBase = EA_BASE_BX_SI; - - switch (mod) { - case 0x0: - if (rm == 0x6) { - insn->eaBase = EA_BASE_NONE; - insn->eaDisplacement = EA_DISP_16; - if (readDisplacement(insn)) - return -1; - } else { - insn->eaBase = (EABase)(eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_NONE; - } - break; - case 0x1: - insn->eaBase = (EABase)(eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_8; - insn->displacementSize = 1; - if (readDisplacement(insn)) - return -1; - break; - case 0x2: - insn->eaBase = (EABase)(eaBaseBase + rm); - insn->eaDisplacement = EA_DISP_16; - if (readDisplacement(insn)) - return -1; - break; - case 0x3: - insn->eaBase = (EABase)(insn->eaRegBase + rm); - if (readDisplacement(insn)) - return -1; - break; - } - break; - } - case 4: - case 8: { - EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); - - switch (mod) { - case 0x0: - insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ - // In determining whether RIP-relative mode is used (rm=5), - // or whether a SIB byte is present (rm=4), - // the extension bits (REX.b and EVEX.x) are ignored. - switch (rm & 7) { - case 0x4: // SIB byte is present - insn->eaBase = (insn->addressSize == 4 ? - EA_BASE_sib : EA_BASE_sib64); - if (readSIB(insn) || readDisplacement(insn)) - return -1; - break; - case 0x5: // RIP-relative - insn->eaBase = EA_BASE_NONE; - insn->eaDisplacement = EA_DISP_32; - if (readDisplacement(insn)) - return -1; - break; - default: - insn->eaBase = (EABase)(eaBaseBase + rm); - break; - } - break; - case 0x1: - insn->displacementSize = 1; - LLVM_FALLTHROUGH; - case 0x2: - insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); - switch (rm & 7) { - case 0x4: // SIB byte is present - insn->eaBase = EA_BASE_sib; - if (readSIB(insn) || readDisplacement(insn)) - return -1; - break; - default: - insn->eaBase = (EABase)(eaBaseBase + rm); - if (readDisplacement(insn)) - return -1; - break; - } - break; - case 0x3: - insn->eaDisplacement = EA_DISP_NONE; - insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm); - break; - } - break; - } - } /* switch (insn->addressSize) */ - - return 0; -} - -#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \ - static uint16_t name(struct InternalInstruction *insn, \ - OperandType type, \ - uint8_t index, \ - uint8_t *valid) { \ - *valid = 1; \ - switch (type) { \ - default: \ - debug("Unhandled register type"); \ - *valid = 0; \ - return 0; \ - case TYPE_Rv: \ - return base + index; \ - case TYPE_R8: \ - index &= mask; \ - if (index > 0xf) \ - *valid = 0; \ - if (insn->rexPrefix && \ - index >= 4 && index <= 7) { \ - return prefix##_SPL + (index - 4); \ - } else { \ - return prefix##_AL + index; \ - } \ - case TYPE_R16: \ - index &= mask; \ - if (index > 0xf) \ - *valid = 0; \ - return prefix##_AX + index; \ - case TYPE_R32: \ - index &= mask; \ - if (index > 0xf) \ - *valid = 0; \ - return prefix##_EAX + index; \ - case TYPE_R64: \ - index &= mask; \ - if (index > 0xf) \ - *valid = 0; \ - return prefix##_RAX + index; \ - case TYPE_ZMM: \ - return prefix##_ZMM0 + index; \ - case TYPE_YMM: \ - return prefix##_YMM0 + index; \ - case TYPE_XMM: \ - return prefix##_XMM0 + index; \ - case TYPE_VK: \ - index &= 0xf; \ - if (index > 7) \ - *valid = 0; \ - return prefix##_K0 + index; \ - case TYPE_VK_PAIR: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_K0_K1 + (index / 2); \ - case TYPE_MM64: \ - return prefix##_MM0 + (index & 0x7); \ - case TYPE_SEGMENTREG: \ - if ((index & 7) > 5) \ - *valid = 0; \ - return prefix##_ES + (index & 7); \ - case TYPE_DEBUGREG: \ - return prefix##_DR0 + index; \ - case TYPE_CONTROLREG: \ - return prefix##_CR0 + index; \ - case TYPE_BNDR: \ - if (index > 3) \ - *valid = 0; \ - return prefix##_BND0 + index; \ - case TYPE_MVSIBX: \ - return prefix##_XMM0 + index; \ - case TYPE_MVSIBY: \ - return prefix##_YMM0 + index; \ - case TYPE_MVSIBZ: \ - return prefix##_ZMM0 + index; \ - } \ - } - -/* - * fixup*Value - Consults an operand type to determine the meaning of the - * reg or R/M field. If the operand is an XMM operand, for example, an - * operand would be XMM0 instead of AX, which readModRM() would otherwise - * misinterpret it as. - * - * @param insn - The instruction containing the operand. - * @param type - The operand type. - * @param index - The existing value of the field as reported by readModRM(). - * @param valid - The address of a uint8_t. The target is set to 1 if the - * field is valid for the register class; 0 if not. - * @return - The proper value. - */ -GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f) -GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf) - -/* - * fixupReg - Consults an operand specifier to determine which of the - * fixup*Value functions to use in correcting readModRM()'ss interpretation. - * - * @param insn - See fixup*Value(). - * @param op - The operand specifier. - * @return - 0 if fixup was successful; -1 if the register returned was - * invalid for its class. 
- */ -static int fixupReg(struct InternalInstruction *insn, - const struct OperandSpecifier *op) { - uint8_t valid; - - dbgprintf(insn, "fixupReg()"); - - switch ((OperandEncoding)op->encoding) { - default: - debug("Expected a REG or R/M encoding in fixupReg"); - return -1; - case ENCODING_VVVV: - insn->vvvv = (Reg)fixupRegValue(insn, - (OperandType)op->type, - insn->vvvv, - &valid); - if (!valid) - return -1; - break; - case ENCODING_REG: - insn->reg = (Reg)fixupRegValue(insn, - (OperandType)op->type, - insn->reg - insn->regBase, - &valid); - if (!valid) - return -1; - break; - CASE_ENCODING_RM: - if (insn->eaBase >= insn->eaRegBase) { - insn->eaBase = (EABase)fixupRMValue(insn, - (OperandType)op->type, - insn->eaBase - insn->eaRegBase, - &valid); - if (!valid) - return -1; - } - break; - } - - return 0; -} - -/* - * readOpcodeRegister - Reads an operand from the opcode field of an - * instruction and interprets it appropriately given the operand width. - * Handles AddRegFrm instructions. - * - * @param insn - the instruction whose opcode field is to be read. - * @param size - The width (in bytes) of the register being specified. - * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means - * RAX. - * @return - 0 on success; nonzero otherwise. - */ -static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { - dbgprintf(insn, "readOpcodeRegister()"); - - if (size == 0) - size = insn->registerSize; - - switch (size) { - case 1: - insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - if (insn->rexPrefix && - insn->opcodeRegister >= MODRM_REG_AL + 0x4 && - insn->opcodeRegister < MODRM_REG_AL + 0x8) { - insn->opcodeRegister = (Reg)(MODRM_REG_SPL - + (insn->opcodeRegister - MODRM_REG_AL - 4)); - } - - break; - case 2: - insn->opcodeRegister = (Reg)(MODRM_REG_AX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - case 4: - insn->opcodeRegister = (Reg)(MODRM_REG_EAX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - case 8: - insn->opcodeRegister = (Reg)(MODRM_REG_RAX - + ((bFromREX(insn->rexPrefix) << 3) - | (insn->opcode & 7))); - break; - } - - return 0; -} - -/* - * readImmediate - Consumes an immediate operand from an instruction, given the - * desired operand size. - * - * @param insn - The instruction whose operand is to be read. - * @param size - The width (in bytes) of the operand. - * @return - 0 if the immediate was successfully consumed; nonzero - * otherwise. 
- */ -static int readImmediate(struct InternalInstruction* insn, uint8_t size) { - uint8_t imm8; - uint16_t imm16; - uint32_t imm32; - uint64_t imm64; - - dbgprintf(insn, "readImmediate()"); - - if (insn->numImmediatesConsumed == 2) { - debug("Already consumed two immediates"); - return -1; - } - - if (size == 0) - size = insn->immediateSize; - else - insn->immediateSize = size; - insn->immediateOffset = insn->readerCursor - insn->startLocation; - - switch (size) { - case 1: - if (consumeByte(insn, &imm8)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm8; - break; - case 2: - if (consumeUInt16(insn, &imm16)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm16; - break; - case 4: - if (consumeUInt32(insn, &imm32)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm32; - break; - case 8: - if (consumeUInt64(insn, &imm64)) - return -1; - insn->immediates[insn->numImmediatesConsumed] = imm64; - break; - } - - insn->numImmediatesConsumed++; - - return 0; -} - -/* - * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. - * - * @param insn - The instruction whose operand is to be read. - * @return - 0 if the vvvv was successfully consumed; nonzero - * otherwise. - */ -static int readVVVV(struct InternalInstruction* insn) { - dbgprintf(insn, "readVVVV()"); - - int vvvv; - if (insn->vectorExtensionType == TYPE_EVEX) - vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 | - vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2])); - else if (insn->vectorExtensionType == TYPE_VEX_3B) - vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]); - else if (insn->vectorExtensionType == TYPE_VEX_2B) - vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]); - else if (insn->vectorExtensionType == TYPE_XOP) - vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]); - else - return -1; - - if (insn->mode != MODE_64BIT) - vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later. - - insn->vvvv = static_cast<Reg>(vvvv); - return 0; -} - -/* - * readMaskRegister - Reads an mask register from the opcode field of an - * instruction. - * - * @param insn - The instruction whose opcode field is to be read. - * @return - 0 on success; nonzero otherwise. - */ -static int readMaskRegister(struct InternalInstruction* insn) { - dbgprintf(insn, "readMaskRegister()"); - - if (insn->vectorExtensionType != TYPE_EVEX) - return -1; - - insn->writemask = - static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3])); - return 0; -} - -/* - * readOperands - Consults the specifier for an instruction and consumes all - * operands for that instruction, interpreting them as it goes. - * - * @param insn - The instruction whose operands are to be read and interpreted. - * @return - 0 if all operands could be read; nonzero otherwise. - */ -static int readOperands(struct InternalInstruction* insn) { - int hasVVVV, needVVVV; - int sawRegImm = 0; - - dbgprintf(insn, "readOperands()"); - - /* If non-zero vvvv specified, need to make sure one of the operands - uses it. */ - hasVVVV = !readVVVV(insn); - needVVVV = hasVVVV && (insn->vvvv != 0); - - for (const auto &Op : x86OperandSets[insn->spec->operands]) { - switch (Op.encoding) { - case ENCODING_NONE: - case ENCODING_SI: - case ENCODING_DI: - break; - CASE_ENCODING_VSIB: - // VSIB can use the V2 bit so check only the other bits. - if (needVVVV) - needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0); - if (readModRM(insn)) - return -1; - - // Reject if SIB wasn't used. 
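readVVVV above assembles the 5-bit EVEX register specifier from two prefix payload bytes: the low four vvvv bits come from one byte and the V' (v2) bit from the final byte supplies the fifth bit. A compact sketch of that combination step, leaving the per-byte bit extraction to the v2FromEVEX4of4 / vvvvFromEVEX3of4 style helpers the decoder already has:

// Combine the EVEX vvvv bits; outside 64-bit mode only bit 4 is cleared
// here, matching the comment in readVVVV (bit 3 is cleared later).
static unsigned combineEvexVVVV(unsigned VvvvLow4, unsigned V2Bit,
                                bool Is64BitMode) {
  unsigned Vvvv = (V2Bit << 4) | (VvvvLow4 & 0xf);
  if (!Is64BitMode)
    Vvvv &= 0xf;
  return Vvvv;
}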
- if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64) - return -1; - - // If sibIndex was set to SIB_INDEX_NONE, index offset is 4. - if (insn->sibIndex == SIB_INDEX_NONE) - insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4); - - // If EVEX.v2 is set this is one of the 16-31 registers. - if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT && - v2FromEVEX4of4(insn->vectorExtensionPrefix[3])) - insn->sibIndex = (SIBIndex)(insn->sibIndex + 16); - - // Adjust the index register to the correct size. - switch ((OperandType)Op.type) { - default: - debug("Unhandled VSIB index type"); - return -1; - case TYPE_MVSIBX: - insn->sibIndex = (SIBIndex)(SIB_INDEX_XMM0 + - (insn->sibIndex - insn->sibIndexBase)); - break; - case TYPE_MVSIBY: - insn->sibIndex = (SIBIndex)(SIB_INDEX_YMM0 + - (insn->sibIndex - insn->sibIndexBase)); - break; - case TYPE_MVSIBZ: - insn->sibIndex = (SIBIndex)(SIB_INDEX_ZMM0 + - (insn->sibIndex - insn->sibIndexBase)); - break; - } - - // Apply the AVX512 compressed displacement scaling factor. - if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) - insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB); - break; - case ENCODING_REG: - CASE_ENCODING_RM: - if (readModRM(insn)) - return -1; - if (fixupReg(insn, &Op)) - return -1; - // Apply the AVX512 compressed displacement scaling factor. - if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) - insn->displacement *= 1 << (Op.encoding - ENCODING_RM); - break; - case ENCODING_IB: - if (sawRegImm) { - /* Saw a register immediate so don't read again and instead split the - previous immediate. FIXME: This is a hack. */ - insn->immediates[insn->numImmediatesConsumed] = - insn->immediates[insn->numImmediatesConsumed - 1] & 0xf; - ++insn->numImmediatesConsumed; - break; - } - if (readImmediate(insn, 1)) - return -1; - if (Op.type == TYPE_XMM || Op.type == TYPE_YMM) - sawRegImm = 1; - break; - case ENCODING_IW: - if (readImmediate(insn, 2)) - return -1; - break; - case ENCODING_ID: - if (readImmediate(insn, 4)) - return -1; - break; - case ENCODING_IO: - if (readImmediate(insn, 8)) - return -1; - break; - case ENCODING_Iv: - if (readImmediate(insn, insn->immediateSize)) - return -1; - break; - case ENCODING_Ia: - if (readImmediate(insn, insn->addressSize)) - return -1; - break; - case ENCODING_IRC: - insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) | - lFromEVEX4of4(insn->vectorExtensionPrefix[3]); - break; - case ENCODING_RB: - if (readOpcodeRegister(insn, 1)) - return -1; - break; - case ENCODING_RW: - if (readOpcodeRegister(insn, 2)) - return -1; - break; - case ENCODING_RD: - if (readOpcodeRegister(insn, 4)) - return -1; - break; - case ENCODING_RO: - if (readOpcodeRegister(insn, 8)) - return -1; - break; - case ENCODING_Rv: - if (readOpcodeRegister(insn, 0)) - return -1; - break; - case ENCODING_CC: - insn->immediates[1] = insn->opcode & 0xf; - break; - case ENCODING_FP: - break; - case ENCODING_VVVV: - needVVVV = 0; /* Mark that we have found a VVVV operand. 
*/ - if (!hasVVVV) - return -1; - if (insn->mode != MODE_64BIT) - insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7); - if (fixupReg(insn, &Op)) - return -1; - break; - case ENCODING_WRITEMASK: - if (readMaskRegister(insn)) - return -1; - break; - case ENCODING_DUP: - break; - default: - dbgprintf(insn, "Encountered an operand with an unknown encoding."); - return -1; - } - } - - /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ - if (needVVVV) return -1; - - return 0; -} - -/* - * decodeInstruction - Reads and interprets a full instruction provided by the - * user. - * - * @param insn - A pointer to the instruction to be populated. Must be - * pre-allocated. - * @param reader - The function to be used to read the instruction's bytes. - * @param readerArg - A generic argument to be passed to the reader to store - * any internal state. - * @param logger - If non-NULL, the function to be used to write log messages - * and warnings. - * @param loggerArg - A generic argument to be passed to the logger to store - * any internal state. - * @param startLoc - The address (in the reader's address space) of the first - * byte in the instruction. - * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to - * decode the instruction in. - * @return - 0 if the instruction's memory could be read; nonzero if - * not. - */ -int llvm::X86Disassembler::decodeInstruction( - struct InternalInstruction *insn, byteReader_t reader, - const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg, - uint64_t startLoc, DisassemblerMode mode) { - memset(insn, 0, sizeof(struct InternalInstruction)); - - insn->reader = reader; - insn->readerArg = readerArg; - insn->dlog = logger; - insn->dlogArg = loggerArg; - insn->startLocation = startLoc; - insn->readerCursor = startLoc; - insn->mode = mode; - insn->numImmediatesConsumed = 0; - - if (readPrefixes(insn) || - readOpcode(insn) || - getID(insn, miiArg) || - insn->instructionID == 0 || - readOperands(insn)) - return -1; - - insn->operands = x86OperandSets[insn->spec->operands]; - - insn->length = insn->readerCursor - insn->startLocation; - - dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", - startLoc, insn->readerCursor, insn->length); - - if (insn->length > 15) - dbgprintf(insn, "Instruction exceeds 15-byte limit"); - - return 0; -} diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 7c0a42c019e3..147fe46d81b9 100644 --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -19,6 +19,9 @@ #include "llvm/Support/X86DisassemblerDecoderCommon.h" namespace llvm { + +class MCInstrInfo; + namespace X86Disassembler { // Accessor functions for various fields of an Intel instruction @@ -446,12 +449,12 @@ enum SIBBase { }; /// Possible displacement types for effective-address computations. -typedef enum { +enum EADisplacement { EA_DISP_NONE, EA_DISP_8, EA_DISP_16, EA_DISP_32 -} EADisplacement; +}; /// All possible values of the reg field in the ModR/M byte. enum Reg { @@ -502,25 +505,6 @@ enum VectorExtensionType { TYPE_XOP = 0x4 }; -/// Type for the byte reader that the consumer must provide to -/// the decoder. Reads a single byte from the instruction's address space. -/// \param arg A baton that the consumer can associate with any internal -/// state that it needs. 
-/// \param byte A pointer to a single byte in memory that should be set to -/// contain the value at address. -/// \param address The address in the instruction's address space that should -/// be read from. -/// \return -1 if the byte cannot be read for any reason; 0 otherwise. -typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address); - -/// Type for the logging function that the consumer can provide to -/// get debugging output from the decoder. -/// \param arg A baton that the consumer can associate with any internal -/// state that it needs. -/// \param log A string that contains the message. Will be reused after -/// the logger returns. -typedef void (*dlog_t)(void *arg, const char *log); - /// The specification for how to extract and interpret a full instruction and /// its operands. struct InstructionSpecifier { @@ -529,18 +513,11 @@ struct InstructionSpecifier { /// The x86 internal instruction, which is produced by the decoder. struct InternalInstruction { - // Reader interface (C) - byteReader_t reader; // Opaque value passed to the reader - const void* readerArg; + llvm::ArrayRef<uint8_t> bytes; // The address of the next byte to read via the reader uint64_t readerCursor; - // Logger interface (C) - dlog_t dlog; - // Opaque value passed to the logger - void* dlogArg; - // General instruction information // The mode to disassemble for (64-bit, protected, real) @@ -616,11 +593,9 @@ struct InternalInstruction { uint8_t modRM; // The SIB byte, used for more complex 32- or 64-bit memory operands - bool consumedSIB; uint8_t sib; // The displacement, used for memory operands - bool consumedDisplacement; int32_t displacement; // Immediates. There can be two in some cases @@ -657,38 +632,6 @@ struct InternalInstruction { ArrayRef<OperandSpecifier> operands; }; -/// Decode one instruction and store the decoding results in -/// a buffer provided by the consumer. -/// \param insn The buffer to store the instruction in. Allocated by the -/// consumer. -/// \param reader The byteReader_t for the bytes to be read. -/// \param readerArg An argument to pass to the reader for storing context -/// specific to the consumer. May be NULL. -/// \param logger The dlog_t to be used in printing status messages from the -/// disassembler. May be NULL. -/// \param loggerArg An argument to pass to the logger for storing context -/// specific to the logger. May be NULL. -/// \param startLoc The address (in the reader's address space) of the first -/// byte in the instruction. -/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in. -/// \return Nonzero if there was an error during decode, 0 otherwise. -int decodeInstruction(InternalInstruction *insn, - byteReader_t reader, - const void *readerArg, - dlog_t logger, - void *loggerArg, - const void *miiArg, - uint64_t startLoc, - DisassemblerMode mode); - -/// Print a message to debugs() -/// \param file The name of the file printing the debug message. -/// \param line The line number that printed the debug message. -/// \param s The message to print. 
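With byteReader_t and dlog_t gone, the decoder's input is the llvm::ArrayRef<uint8_t> now stored directly in InternalInstruction (see the struct changes above). A rough sketch of what bounds-checked byte consumption looks like in that style; this is an illustration of the new shape, not the decoder's actual helper:

#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

// 'Bytes' and 'Cursor' mirror the bytes/readerCursor fields; Cursor is an
// address in the instruction's address space, so it is rebased against
// StartLoc before indexing.
static bool consumeByteSketch(llvm::ArrayRef<uint8_t> Bytes, uint64_t StartLoc,
                              uint64_t &Cursor, uint8_t &Out) {
  uint64_t Offset = Cursor - StartLoc;
  if (Offset >= Bytes.size())
    return false; // ran out of instruction bytes
  Out = Bytes[Offset];
  ++Cursor;
  return true;
}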
-void Debug(const char *file, unsigned line, const char *s); - -StringRef GetInstrName(unsigned Opcode, const void *mii); - } // namespace X86Disassembler } // namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index ed2ee55ff2a5..675a9c377b12 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -38,8 +38,9 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">"); } -void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { +void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, const MCSubtargetInfo &STI, + raw_ostream &OS) { // If verbose assembly is enabled, we can print some informative comments. if (CommentStream) HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII); @@ -69,7 +70,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Try to print any aliases first. else if (!printAliasInstr(MI, OS) && !printVecCompareInstr(MI, OS)) - printInstruction(MI, OS); + printInstruction(MI, Address, OS); // Next always print the annotation. printAnnotation(OS, Annot); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index 747ddd30a2d9..3d5d384dc4a0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -24,8 +24,8 @@ public: : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) override; + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &OS) override; bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); // Autogenerated by tblgen, returns true if we successfully printed an @@ -35,7 +35,7 @@ public: unsigned PrintMethodIdx, raw_ostream &O); // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &OS); + void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index f08fcb575bf0..dffda5217675 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -12,55 +12,95 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" + using namespace llvm; -static unsigned getFixupKindSize(unsigned Kind) { - switch (Kind) { - default: - llvm_unreachable("invalid fixup kind!"); - case FK_NONE: - return 0; - case FK_PCRel_1: - case FK_SecRel_1: - case FK_Data_1: - return 1; - case FK_PCRel_2: - case FK_SecRel_2: - case FK_Data_2: - return 2; - case FK_PCRel_4: - case X86::reloc_riprel_4byte: - case X86::reloc_riprel_4byte_relax: - case X86::reloc_riprel_4byte_relax_rex: - case X86::reloc_riprel_4byte_movq_load: - case X86::reloc_signed_4byte: - case X86::reloc_signed_4byte_relax: - case X86::reloc_global_offset_table: - case X86::reloc_branch_4byte_pcrel: - case FK_SecRel_4: - case FK_Data_4: - return 4; - case FK_PCRel_8: - case FK_SecRel_8: - case FK_Data_8: - case X86::reloc_global_offset_table8: - return 8; +namespace { +/// A wrapper for holding a mask of the values from X86::AlignBranchBoundaryKind +class X86AlignBranchKind { +private: + uint8_t AlignBranchKind = 0; + +public: + void operator=(const std::string &Val) { + if (Val.empty()) + return; + SmallVector<StringRef, 6> BranchTypes; + StringRef(Val).split(BranchTypes, '+', -1, false); + for (auto BranchType : BranchTypes) { + if (BranchType == "fused") + addKind(X86::AlignBranchFused); + else if (BranchType == "jcc") + addKind(X86::AlignBranchJcc); + else if (BranchType == "jmp") + addKind(X86::AlignBranchJmp); + else if (BranchType == "call") + addKind(X86::AlignBranchCall); + else if (BranchType == "ret") + addKind(X86::AlignBranchRet); + else if (BranchType == "indirect") + addKind(X86::AlignBranchIndirect); + else { + report_fatal_error( + "'-x86-align-branch 'The branches's type is combination of jcc, " + "fused, jmp, call, ret, indirect.(plus separated)", + false); + } + } } -} -namespace { + operator uint8_t() const { return AlignBranchKind; } + void addKind(X86::AlignBranchBoundaryKind Value) { AlignBranchKind |= Value; } +}; + +X86AlignBranchKind X86AlignBranchKindLoc; + +cl::opt<unsigned> X86AlignBranchBoundary( + "x86-align-branch-boundary", cl::init(0), + cl::desc( + "Control how the assembler should align branches with NOP. If the " + "boundary's size is not 0, it should be a power of 2 and no less " + "than 32. 
Branches will be aligned to prevent from being across or " + "against the boundary of specified size. The default value 0 does not " + "align branches.")); + +cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch( + "x86-align-branch", + cl::desc("Specify types of branches to align (plus separated list of " + "types). The branches's types are combination of jcc, fused, " + "jmp, call, ret, indirect."), + cl::value_desc("jcc indicates conditional jumps, fused indicates fused " + "conditional jumps, jmp indicates unconditional jumps, call " + "indicates direct and indirect calls, ret indicates rets, " + "indirect indicates indirect jumps."), + cl::location(X86AlignBranchKindLoc)); + +cl::opt<bool> X86AlignBranchWithin32BBoundaries( + "x86-branches-within-32B-boundaries", cl::init(false), + cl::desc( + "Align selected instructions to mitigate negative performance impact " + "of Intel's micro code update for errata skx102. May break " + "assumptions about labels corresponding to particular instructions, " + "and should be used with caution.")); class X86ELFObjectWriter : public MCELFObjectTargetWriter { public: @@ -71,9 +111,42 @@ public: class X86AsmBackend : public MCAsmBackend { const MCSubtargetInfo &STI; + std::unique_ptr<const MCInstrInfo> MCII; + X86AlignBranchKind AlignBranchType; + Align AlignBoundary; + + bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; + + bool needAlign(MCObjectStreamer &OS) const; + bool needAlignInst(const MCInst &Inst) const; + MCBoundaryAlignFragment * + getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const; + MCInst PrevInst; + public: X86AsmBackend(const Target &T, const MCSubtargetInfo &STI) - : MCAsmBackend(support::little), STI(STI) {} + : MCAsmBackend(support::little), STI(STI), + MCII(T.createMCInstrInfo()) { + if (X86AlignBranchWithin32BBoundaries) { + // At the moment, this defaults to aligning fused branches, unconditional + // jumps, and (unfused) conditional jumps with nops. Both the + // instructions aligned and the alignment method (nop vs prefix) may + // change in the future. 
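For reference, the new knobs are ordinary cl::opt flags, so MC-based tools pick them up from the command line; an invocation along these lines (exact tool and triple are illustrative) should align fused jumps, conditional jumps, and unconditional jumps to 32-byte boundaries:

  llvm-mc -triple=x86_64-unknown-linux-gnu -filetype=obj \
      -x86-align-branch-boundary=32 -x86-align-branch=fused+jcc+jmp \
      input.s -o output.o

The -x86-branches-within-32B-boundaries flag is the shorthand that enables the same fused+jcc+jmp set at a 32-byte boundary, with the caveats spelled out in its description.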
+ AlignBoundary = assumeAligned(32);; + AlignBranchType.addKind(X86::AlignBranchFused); + AlignBranchType.addKind(X86::AlignBranchJcc); + AlignBranchType.addKind(X86::AlignBranchJmp); + } + // Allow overriding defaults set by master flag + if (X86AlignBranchBoundary.getNumOccurrences()) + AlignBoundary = assumeAligned(X86AlignBranchBoundary); + if (X86AlignBranch.getNumOccurrences()) + AlignBranchType = X86AlignBranchKindLoc; + } + + bool allowAutoPadding() const override; + void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override; + void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override; unsigned getNumFixupKinds() const override { return X86::NumTargetFixupKinds; @@ -81,49 +154,15 @@ public: Optional<MCFixupKind> getFixupKind(StringRef Name) const override; - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { - const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { - {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, - {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, - {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, - {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, - {"reloc_signed_4byte", 0, 32, 0}, - {"reloc_signed_4byte_relax", 0, 32, 0}, - {"reloc_global_offset_table", 0, 32, 0}, - {"reloc_global_offset_table8", 0, 64, 0}, - {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!"); - return Infos[Kind - FirstTargetFixupKind]; - } - + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, - const MCSubtargetInfo *STI) const override { - unsigned Size = getFixupKindSize(Fixup.getKind()); - - assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); - - // Check that uppper bits are either all zeros or all ones. - // Specifically ignore overflow/underflow as long as the leakage is - // limited to the lower bits. This is to remain compatible with - // other assemblers. 
- assert((Size == 0 || isIntN(Size * 8 + 1, Value)) && - "Value does not fit in the Fixup field"); - - for (unsigned i = 0; i != Size; ++i) - Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); - } + const MCSubtargetInfo *STI) const override; bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const override; @@ -243,6 +282,200 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) { return getRelaxedOpcodeBranch(Inst, is16BitMode); } +static X86::CondCode getCondFromBranch(const MCInst &MI, + const MCInstrInfo &MCII) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + return X86::COND_INVALID; + case X86::JCC_1: { + const MCInstrDesc &Desc = MCII.get(Opcode); + return static_cast<X86::CondCode>( + MI.getOperand(Desc.getNumOperands() - 1).getImm()); + } + } +} + +static X86::SecondMacroFusionInstKind +classifySecondInstInMacroFusion(const MCInst &MI, const MCInstrInfo &MCII) { + X86::CondCode CC = getCondFromBranch(MI, MCII); + return classifySecondCondCodeInMacroFusion(CC); +} + +/// Check if the instruction uses RIP relative addressing. +static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + unsigned CurOp = X86II::getOperandBias(Desc); + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand < 0) + return false; + unsigned BaseRegNum = MemoryOperand + CurOp + X86::AddrBaseReg; + unsigned BaseReg = MI.getOperand(BaseRegNum).getReg(); + return (BaseReg == X86::RIP); +} + +/// Check if the instruction is valid as the first instruction in macro fusion. +static bool isFirstMacroFusibleInst(const MCInst &Inst, + const MCInstrInfo &MCII) { + // An Intel instruction with RIP relative addressing is not macro fusible. + if (isRIPRelative(Inst, MCII)) + return false; + X86::FirstMacroFusionInstKind FIK = + X86::classifyFirstOpcodeInMacroFusion(Inst.getOpcode()); + return FIK != X86::FirstMacroFusionInstKind::Invalid; +} + +/// Check if the two instructions will be macro-fused on the target cpu. +bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const { + const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode()); + if (!InstDesc.isConditionalBranch()) + return false; + if (!isFirstMacroFusibleInst(Cmp, *MCII)) + return false; + const X86::FirstMacroFusionInstKind CmpKind = + X86::classifyFirstOpcodeInMacroFusion(Cmp.getOpcode()); + const X86::SecondMacroFusionInstKind BranchKind = + classifySecondInstInMacroFusion(Jcc, *MCII); + return X86::isMacroFused(CmpKind, BranchKind); +} + +/// Check if the instruction has a variant symbol operand. +static bool hasVariantSymbol(const MCInst &MI) { + for (auto &Operand : MI) { + if (!Operand.isExpr()) + continue; + const MCExpr &Expr = *Operand.getExpr(); + if (Expr.getKind() == MCExpr::SymbolRef && + cast<MCSymbolRefExpr>(Expr).getKind() != MCSymbolRefExpr::VK_None) + return true; + } + return false; +} + +bool X86AsmBackend::allowAutoPadding() const { + return (AlignBoundary != Align::None() && + AlignBranchType != X86::AlignBranchNone); +} + +bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { + if (!OS.getAllowAutoPadding()) + return false; + assert(allowAutoPadding() && "incorrect initialization!"); + + MCAssembler &Assembler = OS.getAssembler(); + MCSection *Sec = OS.getCurrentSectionOnly(); + // To be Done: Currently don't deal with Bundle cases. 
+ if (Assembler.isBundlingEnabled() && Sec->isBundleLocked()) + return false; + + // Branches only need to be aligned in 32-bit or 64-bit mode. + if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit))) + return false; + + return true; +} + +/// Check if the instruction operand needs to be aligned. Padding is disabled +/// before intruction which may be rewritten by linker(e.g. TLSCALL). +bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { + // Linker may rewrite the instruction with variant symbol operand. + if (hasVariantSymbol(Inst)) + return false; + + const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode()); + return (InstDesc.isConditionalBranch() && + (AlignBranchType & X86::AlignBranchJcc)) || + (InstDesc.isUnconditionalBranch() && + (AlignBranchType & X86::AlignBranchJmp)) || + (InstDesc.isCall() && + (AlignBranchType & X86::AlignBranchCall)) || + (InstDesc.isReturn() && + (AlignBranchType & X86::AlignBranchRet)) || + (InstDesc.isIndirectBranch() && + (AlignBranchType & X86::AlignBranchIndirect)); +} + +static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) { + // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it. + return !F.canEmitNops(); +} + +MCBoundaryAlignFragment * +X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const { + auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment()); + if (!F || !canReuseBoundaryAlignFragment(*F)) { + F = new MCBoundaryAlignFragment(AlignBoundary); + OS.insert(F); + } + return F; +} + +/// Insert MCBoundaryAlignFragment before instructions to align branches. +void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, + const MCInst &Inst) { + if (!needAlign(OS)) + return; + + MCFragment *CF = OS.getCurrentFragment(); + bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused; + if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) { + // Macro fusion actually happens and there is no other fragment inserted + // after the previous instruction. NOP can be emitted in PF to align fused + // jcc. + if (auto *PF = + dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) { + const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true); + const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true); + } + } else if (needAlignInst(Inst)) { + // Note: When there is at least one fragment, such as MCAlignFragment, + // inserted after the previous instruction, e.g. + // + // \code + // cmp %rax %rcx + // .align 16 + // je .Label0 + // \ endcode + // + // We will treat the JCC as a unfused branch although it may be fused + // with the CMP. + auto *F = getOrCreateBoundaryAlignFragment(OS); + F->setEmitNops(true); + F->setFused(false); + } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) { + // We don't know if macro fusion happens until the reaching the next + // instruction, so a place holder is put here if necessary. + getOrCreateBoundaryAlignFragment(OS); + } + + PrevInst = Inst; +} + +/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned +/// if necessary. +void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { + if (!needAlign(OS)) + return; + // If the branch is emitted into a MCRelaxableFragment, we can determine the + // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the + // branch is fused, the fused branch(macro fusion pair) must be emitted into + // two fragments. 
Or when the branch is unfused, the branch must be emitted + // into one fragment. The MCRelaxableFragment naturally marks the end of the + // fused or unfused branch. + // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of + // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align + // other branch. + if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment())) + OS.insert(new MCBoundaryAlignFragment(AlignBoundary)); + + // Update the maximum alignment on the current section if necessary. + MCSection *Sec = OS.getCurrentSectionOnly(); + if (AlignBoundary.value() > Sec->getAlignment()) + Sec->setAlignment(AlignBoundary); +} + Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { if (STI.getTargetTriple().isOSBinFormatELF()) { if (STI.getTargetTriple().getArch() == Triple::x86_64) { @@ -256,12 +489,100 @@ Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { return MCAsmBackend::getFixupKind(Name); } +const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_signed_4byte", 0, 32, 0}, + {"reloc_signed_4byte_relax", 0, 32, 0}, + {"reloc_global_offset_table", 0, 32, 0}, + {"reloc_global_offset_table8", 0, 64, 0}, + {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!"); + return Infos[Kind - FirstTargetFixupKind]; +} + bool X86AsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, const MCValue &) { return Fixup.getKind() == FK_NONE; } +static unsigned getFixupKindSize(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_NONE: + return 0; + case FK_PCRel_1: + case FK_SecRel_1: + case FK_Data_1: + return 1; + case FK_PCRel_2: + case FK_SecRel_2: + case FK_Data_2: + return 2; + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_relax: + case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case X86::reloc_signed_4byte_relax: + case X86::reloc_global_offset_table: + case X86::reloc_branch_4byte_pcrel: + case FK_SecRel_4: + case FK_Data_4: + return 4; + case FK_PCRel_8: + case FK_SecRel_8: + case FK_Data_8: + case X86::reloc_global_offset_table8: + return 8; + } +} + +void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, + MutableArrayRef<char> Data, + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const { + unsigned Size = getFixupKindSize(Fixup.getKind()); + + assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); + + int64_t SignedValue = static_cast<int64_t>(Value); + if ((Target.isAbsolute() || IsResolved) && + getFixupKindInfo(Fixup.getKind()).Flags & + MCFixupKindInfo::FKF_IsPCRel) { + // check that PC relative fixup fits into the fixup size. 
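A quick worked example of the PC-relative range check introduced in this hunk: llvm::isIntN(N, V) tests whether V fits in an N-bit signed field, so a resolved PC-relative fixup of Size bytes must lie in [-2^(8*Size-1), 2^(8*Size-1)-1]; values outside that range now hit the "is too large for field" diagnostic rather than being truncated silently.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

void pcRelFixupRangeExample() {
  assert(llvm::isIntN(8, int64_t(127)));      // fits a 1-byte PC-rel fixup
  assert(!llvm::isIntN(8, int64_t(200)));     // would be diagnosed
  assert(llvm::isIntN(32, int64_t(1) << 30)); // fits a 4-byte PC-rel fixup
}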
+ if (Size > 0 && !isIntN(Size * 8, SignedValue)) + Asm.getContext().reportError( + Fixup.getLoc(), "value of " + Twine(SignedValue) + + " is too large for field of " + Twine(Size) + + ((Size == 1) ? " byte." : " bytes.")); + } else { + // Check that uppper bits are either all zeros or all ones. + // Specifically ignore overflow/underflow as long as the leakage is + // limited to the lower bits. This is to remain compatible with + // other assemblers. + assert((Size == 0 || isIntN(Size * 8 + 1, SignedValue)) && + "Value does not fit in the Fixup field"); + } + + for (unsigned i = 0; i != Size; ++i) + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); +} + bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const { // Branches can always be relaxed in either mode. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 6bd6c6cac7df..a4f8dd669e1e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -101,6 +101,261 @@ namespace X86 { COND_INVALID }; + + // The classification for the first instruction in macro fusion. + enum class FirstMacroFusionInstKind { + // TEST + Test, + // CMP + Cmp, + // AND + And, + // ADD, SUB + AddSub, + // INC, DEC + IncDec, + // Not valid as a first macro fusion instruction + Invalid + }; + + enum class SecondMacroFusionInstKind { + // JA, JB and variants. + AB, + // JE, JL, JG and variants. + ELG, + // JS, JP, JO and variants + SPO, + // Not a fusible jump. + Invalid, + }; + + /// \returns the type of the first instruction in macro-fusion. + inline FirstMacroFusionInstKind + classifyFirstOpcodeInMacroFusion(unsigned Opcode) { + switch (Opcode) { + default: + return FirstMacroFusionInstKind::Invalid; + // TEST + case X86::TEST16i16: + case X86::TEST16mr: + case X86::TEST16ri: + case X86::TEST16rr: + case X86::TEST32i32: + case X86::TEST32mr: + case X86::TEST32ri: + case X86::TEST32rr: + case X86::TEST64i32: + case X86::TEST64mr: + case X86::TEST64ri32: + case X86::TEST64rr: + case X86::TEST8i8: + case X86::TEST8mr: + case X86::TEST8ri: + case X86::TEST8rr: + return FirstMacroFusionInstKind::Test; + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND16rr_REV: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND32rr_REV: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND64rr_REV: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8ri8: + case X86::AND8rm: + case X86::AND8rr: + case X86::AND8rr_REV: + return FirstMacroFusionInstKind::And; + // CMP + case X86::CMP16i16: + case X86::CMP16mr: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP16rr_REV: + case X86::CMP32i32: + case X86::CMP32mr: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP32rr_REV: + case X86::CMP64i32: + case X86::CMP64mr: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP64rr_REV: + case X86::CMP8i8: + case X86::CMP8mr: + case X86::CMP8ri: + case X86::CMP8ri8: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::CMP8rr_REV: + return FirstMacroFusionInstKind::Cmp; + // ADD + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16rm: + case X86::ADD16rr: 
+ case X86::ADD16rr_REV: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_REV: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_REV: + case X86::ADD8i8: + case X86::ADD8ri: + case X86::ADD8ri8: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::ADD8rr_REV: + // SUB + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB16rr_REV: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB32rr_REV: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB64rr_REV: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8ri8: + case X86::SUB8rm: + case X86::SUB8rr: + case X86::SUB8rr_REV: + return FirstMacroFusionInstKind::AddSub; + // INC + case X86::INC16r: + case X86::INC16r_alt: + case X86::INC32r: + case X86::INC32r_alt: + case X86::INC64r: + case X86::INC8r: + // DEC + case X86::DEC16r: + case X86::DEC16r_alt: + case X86::DEC32r: + case X86::DEC32r_alt: + case X86::DEC64r: + case X86::DEC8r: + return FirstMacroFusionInstKind::IncDec; + } + } + + /// \returns the type of the second instruction in macro-fusion. + inline SecondMacroFusionInstKind + classifySecondCondCodeInMacroFusion(X86::CondCode CC) { + if (CC == X86::COND_INVALID) + return SecondMacroFusionInstKind::Invalid; + + switch (CC) { + default: + return SecondMacroFusionInstKind::Invalid; + // JE,JZ + case X86::COND_E: + // JNE,JNZ + case X86::COND_NE: + // JL,JNGE + case X86::COND_L: + // JLE,JNG + case X86::COND_LE: + // JG,JNLE + case X86::COND_G: + // JGE,JNL + case X86::COND_GE: + return SecondMacroFusionInstKind::ELG; + // JB,JC + case X86::COND_B: + // JNA,JBE + case X86::COND_BE: + // JA,JNBE + case X86::COND_A: + // JAE,JNC,JNB + case X86::COND_AE: + return SecondMacroFusionInstKind::AB; + // JS + case X86::COND_S: + // JNS + case X86::COND_NS: + // JP,JPE + case X86::COND_P: + // JNP,JPO + case X86::COND_NP: + // JO + case X86::COND_O: + // JNO + case X86::COND_NO: + return SecondMacroFusionInstKind::SPO; + } + } + + /// \param FirstKind kind of the first instruction in macro fusion. + /// \param SecondKind kind of the second instruction in macro fusion. + /// + /// \returns true if the two instruction can be macro fused. + inline bool isMacroFused(FirstMacroFusionInstKind FirstKind, + SecondMacroFusionInstKind SecondKind) { + switch (FirstKind) { + case X86::FirstMacroFusionInstKind::Test: + case X86::FirstMacroFusionInstKind::And: + return true; + case X86::FirstMacroFusionInstKind::Cmp: + case X86::FirstMacroFusionInstKind::AddSub: + return SecondKind == X86::SecondMacroFusionInstKind::AB || + SecondKind == X86::SecondMacroFusionInstKind::ELG; + case X86::FirstMacroFusionInstKind::IncDec: + return SecondKind == X86::SecondMacroFusionInstKind::ELG; + case X86::FirstMacroFusionInstKind::Invalid: + return false; + } + llvm_unreachable("unknown fusion type"); + } + + /// Defines the possible values of the branch boundary alignment mask. 
+ enum AlignBranchBoundaryKind : uint8_t { + AlignBranchNone = 0, + AlignBranchFused = 1U << 0, + AlignBranchJcc = 1U << 1, + AlignBranchJmp = 1U << 2, + AlignBranchCall = 1U << 3, + AlignBranchRet = 1U << 4, + AlignBranchIndirect = 1U << 5 + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -645,9 +900,8 @@ namespace X86II { NOTRACK = 1ULL << NoTrackShift }; - // getBaseOpcodeFor - This function returns the "base" X86 opcode for the - // specified machine instruction. - // + /// \returns the "base" X86 opcode for the specified machine + /// instruction. inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } @@ -656,8 +910,8 @@ namespace X86II { return (TSFlags & X86II::ImmMask) != 0; } - /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field - /// of the specified instruction. + /// Decode the "size of immediate" field from the TSFlags field of the + /// specified instruction. inline unsigned getSizeOfImm(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); @@ -673,9 +927,9 @@ namespace X86II { } } - /// isImmPCRel - Return true if the immediate of the specified instruction's - /// TSFlags indicates that it is pc relative. - inline unsigned isImmPCRel(uint64_t TSFlags) { + /// \returns true if the immediate of the specified instruction's TSFlags + /// indicates that it is pc relative. + inline bool isImmPCRel(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate size"); case X86II::Imm8PCRel: @@ -692,9 +946,9 @@ namespace X86II { } } - /// isImmSigned - Return true if the immediate of the specified instruction's + /// \returns true if the immediate of the specified instruction's /// TSFlags indicates that it is signed. - inline unsigned isImmSigned(uint64_t TSFlags) { + inline bool isImmSigned(uint64_t TSFlags) { switch (TSFlags & X86II::ImmMask) { default: llvm_unreachable("Unknown immediate signedness"); case X86II::Imm32S: @@ -711,8 +965,8 @@ namespace X86II { } } - /// getOperandBias - compute whether all of the def operands are repeated - /// in the uses and therefore should be skipped. + /// Compute whether all of the def operands are repeated in the uses and + /// therefore should be skipped. /// This determines the start of the unique operand list. We need to determine /// if all of the defs have a corresponding tied operand in the uses. /// Unfortunately, the tied operand information is encoded in the uses not @@ -750,8 +1004,8 @@ namespace X86II { } } - /// getMemoryOperandNo - The function returns the MCInst operand # for the - /// first field of the memory operand. If the instruction doesn't have a + /// The function returns the MCInst operand # for the first field of the + /// memory operand. If the instruction doesn't have a /// memory operand, this returns -1. /// /// Note that this ignores tied operands. If there is a tied register which @@ -837,8 +1091,8 @@ namespace X86II { } } - /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or - /// higher) register? e.g. r8, xmm8, xmm13, etc. + /// \returns true if the MachineOperand is a x86-64 extended (r8 or + /// higher) register, e.g. r8, xmm8, xmm13, etc. 
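Putting the new X86BaseInfo.h helpers to use: fusibility of a CMP/JCC pair is decided by classifying each half and combining the results, and the AlignBranchBoundaryKind values form a plain bit mask. A small sketch (assumes the declarations above are in scope, e.g. via X86BaseInfo.h):

#include <cstdint>

// CMP64rr classifies as FirstMacroFusionInstKind::Cmp and COND_E as
// SecondMacroFusionInstKind::ELG, a combination isMacroFused accepts.
static bool wouldFuseCmpJe() {
  llvm::X86::FirstMacroFusionInstKind First =
      llvm::X86::classifyFirstOpcodeInMacroFusion(llvm::X86::CMP64rr);
  llvm::X86::SecondMacroFusionInstKind Second =
      llvm::X86::classifySecondCondCodeInMacroFusion(llvm::X86::COND_E);
  return llvm::X86::isMacroFused(First, Second); // true for this pair
}

// Branch-kind masks combine with bitwise OR, as the option parser does:
static const uint8_t FusedAndJcc =
    llvm::X86::AlignBranchFused | llvm::X86::AlignBranchJcc;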
inline bool isX86_64ExtendedReg(unsigned RegNo) { if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) || (RegNo >= X86::YMM8 && RegNo <= X86::YMM31) || @@ -864,8 +1118,8 @@ namespace X86II { return false; } - /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher) - /// registers? e.g. zmm21, etc. + /// \returns true if the MemoryOperand is a 32 extended (zmm16 or higher) + /// registers, e.g. zmm21, etc. static inline bool is32ExtendedReg(unsigned RegNo) { return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) || (RegNo >= X86::YMM16 && RegNo <= X86::YMM31) || @@ -878,12 +1132,12 @@ namespace X86II { reg == X86::SIL || reg == X86::DIL); } - /// isKMasked - Is this a masked instruction. + /// \returns true if this is a masked instruction. inline bool isKMasked(uint64_t TSFlags) { return (TSFlags & X86II::EVEX_K) != 0; } - /// isKMergedMasked - Is this a merge masked instruction. + /// \returns true if this is a merge masked instruction. inline bool isKMergeMasked(uint64_t TSFlags) { return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index ea28bef42569..f4bb0fbf62cd 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -36,9 +36,9 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << getRegisterName(RegNo); } -void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, - const MCSubtargetInfo &STI) { +void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, const MCSubtargetInfo &STI, + raw_ostream &OS) { printInstFlags(MI, OS); // In 16-bit mode, print data16 as data32. @@ -47,7 +47,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, OS << "\tdata32"; } else if (!printAliasInstr(MI, OS) && !printVecCompareInstr(MI, OS)) - printInstruction(MI, OS); + printInstruction(MI, Address, OS); // Next always print the annotation. printAnnotation(OS, Annot); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h index f32f49f7c417..b409b20cbea8 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -25,8 +25,8 @@ public: : X86InstPrinterCommon(MAI, MII, MRI) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) override; + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &OS) override; bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS); // Autogenerated by tblgen, returns true if we successfully printed an @@ -36,7 +36,7 @@ public: unsigned PrintMethodIdx, raw_ostream &O); // Autogenerated by tblgen. 
- void printInstruction(const MCInst *MI, raw_ostream &O); + void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index ac36bf3a12fa..54a293702bd0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -42,91 +42,68 @@ class X86MCCodeEmitter : public MCCodeEmitter { public: X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : MCII(mcii), Ctx(ctx) { - } + : MCII(mcii), Ctx(ctx) {} X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete; ~X86MCCodeEmitter() override = default; - bool is64BitMode(const MCSubtargetInfo &STI) const { - return STI.getFeatureBits()[X86::Mode64Bit]; - } - - bool is32BitMode(const MCSubtargetInfo &STI) const { - return STI.getFeatureBits()[X86::Mode32Bit]; - } - - bool is16BitMode(const MCSubtargetInfo &STI) const { - return STI.getFeatureBits()[X86::Mode16Bit]; - } - - /// Is16BitMemOperand - Return true if the specified instruction has - /// a 16-bit memory operand. Op specifies the operand # of the memoperand. - bool Is16BitMemOperand(const MCInst &MI, unsigned Op, - const MCSubtargetInfo &STI) const { - const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + void emitPrefix(const MCInst &MI, raw_ostream &OS, + const MCSubtargetInfo &STI) const override; - if (is16BitMode(STI) && BaseReg.getReg() == 0 && - Disp.isImm() && Disp.getImm() < 0x10000) - return true; - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) - return true; - return false; - } + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; - unsigned GetX86RegNum(const MCOperand &MO) const { +private: + unsigned getX86RegNum(const MCOperand &MO) const { return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; } unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const { return Ctx.getRegisterInfo()->getEncodingValue( - MI.getOperand(OpNum).getReg()); + MI.getOperand(OpNum).getReg()); } - // Does this register require a bit to be set in REX prefix. + /// \param MI a single low-level machine instruction. + /// \param OpNum the operand #. + /// \returns true if the OpNumth operand of MI require a bit to be set in + /// REX prefix. bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const { return (getX86RegEncoding(MI, OpNum) >> 3) & 1; } - void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const { + void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const { OS << (char)C; ++CurByte; } - void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, raw_ostream &OS) const { // Output the constant in little endian byte order. 
for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, CurByte, OS); + emitByte(Val & 255, CurByte, OS); Val >>= 8; } } - void EmitImmediate(const MCOperand &Disp, SMLoc Loc, - unsigned ImmSize, MCFixupKind FixupKind, - unsigned &CurByte, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - int ImmOffset = 0) const; + void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize, + MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const; - static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { + static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); return RM | (RegOpcode << 3) | (Mod << 6); } - void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, + void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, unsigned &CurByte, raw_ostream &OS) const { - EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS); + emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS); } - void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base, + void emitSIBByte(unsigned SS, unsigned Index, unsigned Base, unsigned &CurByte, raw_ostream &OS) const { - // SIB byte is in the same format as the ModRMByte. - EmitByte(ModRMByte(SS, Index, Base), CurByte, OS); + // SIB byte is in the same format as the modRMByte. + emitByte(modRMByte(SS, Index, Base), CurByte, OS); } void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, @@ -134,43 +111,39 @@ public: raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte, + bool &Rex, const MCInst &MI, const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, raw_ostream &OS) const; - void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const; - void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, + void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, const MCInst &MI, raw_ostream &OS) const; bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, const MCSubtargetInfo &STI, raw_ostream &OS) const; - uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, - int MemOperand, const MCInstrDesc &Desc) const; - - bool isPCRel32Branch(const MCInst &MI) const; + uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand, + const MCInstrDesc &Desc) const; }; } // end anonymous namespace -/// isDisp8 - Return true if this signed displacement fits in a 8-bit -/// sign-extended field. -static bool isDisp8(int Value) { - return Value == (int8_t)Value; -} +/// \returns true if this signed displacement fits in a 8-bit sign-extended +/// field. +static bool isDisp8(int Value) { return Value == (int8_t)Value; } -/// isCDisp8 - Return true if this signed displacement fits in a 8-bit -/// compressed dispacement field. -static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { +/// \returns true if this signed displacement fits in a 8-bit compressed +/// dispacement field. 
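The modRMByte helper above packs the three ModRM fields into their standard positions: mod in bits 7-6, reg/opcode in bits 5-3, r/m in bits 2-0. A tiny worked example:

#include <cassert>
#include <cstdint>

// Same packing as modRMByte: RM | (RegOpcode << 3) | (Mod << 6).
static uint8_t packModRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  return RM | (RegOpcode << 3) | (Mod << 6);
}

void modRMPackingExample() {
  // mod=3 (register direct), reg=EAX(0), rm=ECX(1) -> 0xC1, the ModRM
  // byte of "01 C1", i.e. add ecx, eax.
  assert(packModRM(3, 0, 1) == 0xC1);
}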
+static bool isCDisp8(uint64_t TSFlags, int Value, int &CValue) { assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && "Compressed 8-bit displacement is only valid for EVEX inst."); unsigned CD8_Scale = - (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; + (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; if (CD8_Scale == 0) { CValue = Value; return isDisp8(Value); @@ -188,26 +161,49 @@ static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { return Ret; } -/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate -/// in an instruction with the specified TSFlags. +/// \returns the appropriate fixup kind to use for an immediate in an +/// instruction with the specified TSFlags. static MCFixupKind getImmFixupKind(uint64_t TSFlags) { unsigned Size = X86II::getSizeOfImm(TSFlags); bool isPCRel = X86II::isImmPCRel(TSFlags); if (X86II::isImmSigned(TSFlags)) { switch (Size) { - default: llvm_unreachable("Unsupported signed fixup size!"); - case 4: return MCFixupKind(X86::reloc_signed_4byte); + default: + llvm_unreachable("Unsupported signed fixup size!"); + case 4: + return MCFixupKind(X86::reloc_signed_4byte); } } return MCFixup::getKindForSize(Size, isPCRel); } -/// Is32BitMemOperand - Return true if the specified instruction has -/// a 32-bit memory operand. Op specifies the operand # of the memoperand. -static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 16-bit memory operand. +static bool is16BitMemOperand(const MCInst &MI, unsigned Op, + const MCSubtargetInfo &STI) { + const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); + const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); + + if (STI.hasFeature(X86::Mode16Bit) && BaseReg.getReg() == 0 && Disp.isImm() && + Disp.getImm() < 0x10000) + return true; + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 32-bit memory operand. +static bool is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); if ((BaseReg.getReg() != 0 && X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || @@ -223,12 +219,13 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { return false; } -/// Is64BitMemOperand - Return true if the specified instruction has -/// a 64-bit memory operand. Op specifies the operand # of the memoperand. +/// \param Op operand # of the memory operand. +/// +/// \returns true if the specified instruction has a 64-bit memory operand. 
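On isCDisp8: EVEX instructions use compressed displacement (disp8*N), where the 8-bit displacement is implicitly scaled by a per-instruction factor, so displacements that are multiples of N keep the short encoding. A hedged numeric sketch of the idea with the scale passed in directly (the real helper derives it from the CD8_Scale bits of TSFlags and also handles the unscaled case):

#include <cstdint>

// Scale >= 1 is assumed; returns true if Value is encodable as a scaled
// signed 8-bit displacement and writes the compressed value to CValue.
static bool fitsCompressedDisp8(int32_t Value, int32_t Scale, int32_t &CValue) {
  if (Value % Scale != 0)
    return false;
  CValue = Value / Scale;
  return CValue == int8_t(CValue);
}
// Example: with a 64-byte scale (a full 512-bit memory operand), a
// displacement of 128 compresses to disp8 == 2, while 129 falls back to disp32.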
#ifndef NDEBUG -static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { - const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); +static bool is64BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); if ((BaseReg.getReg() != 0 && X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || @@ -239,19 +236,15 @@ static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { } #endif -/// StartsWithGlobalOffsetTable - Check if this expression starts with -/// _GLOBAL_OFFSET_TABLE_ and if it is of the form -/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF -/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that -/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start -/// of a binary expression. -enum GlobalOffsetTableExprKind { - GOT_None, - GOT_Normal, - GOT_SymDiff -}; +enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff }; + +/// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is +/// of the form _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on +/// ELF i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that +/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start of a +/// binary expression. static GlobalOffsetTableExprKind -StartsWithGlobalOffsetTable(const MCExpr *Expr) { +startsWithGlobalOffsetTable(const MCExpr *Expr) { const MCExpr *RHS = nullptr; if (Expr->getKind() == MCExpr::Binary) { const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); @@ -262,7 +255,7 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) { if (Expr->getKind() != MCExpr::SymbolRef) return GOT_None; - const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr); const MCSymbol &S = Ref->getSymbol(); if (S.getName() != "_GLOBAL_OFFSET_TABLE_") return GOT_None; @@ -271,15 +264,15 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) { return GOT_Normal; } -static bool HasSecRelSymbolRef(const MCExpr *Expr) { +static bool hasSecRelSymbolRef(const MCExpr *Expr) { if (Expr->getKind() == MCExpr::SymbolRef) { - const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr); return Ref->getKind() == MCSymbolRefExpr::VK_SECREL; } return false; } -bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const { +static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MCII.get(Opcode); if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) || @@ -295,18 +288,18 @@ bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const { return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None; } -void X86MCCodeEmitter:: -EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, - MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { +void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, + unsigned Size, MCFixupKind FixupKind, + unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + int ImmOffset) const { const MCExpr *Expr = nullptr; if (DispOp.isImm()) { // If this is a simple integer displacement that doesn't 
require a // relocation, emit it now. - if (FixupKind != FK_PCRel_1 && - FixupKind != FK_PCRel_2 && + if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 && FixupKind != FK_PCRel_4) { - EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS); return; } Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); @@ -315,10 +308,9 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, } // If we have an immoffset, add it to the expression. - if ((FixupKind == FK_Data_4 || - FixupKind == FK_Data_8 || + if ((FixupKind == FK_Data_4 || FixupKind == FK_Data_8 || FixupKind == MCFixupKind(X86::reloc_signed_4byte))) { - GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr); + GlobalOffsetTableExprKind Kind = startsWithGlobalOffsetTable(Expr); if (Kind != GOT_None) { assert(ImmOffset == 0); @@ -332,13 +324,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, if (Kind == GOT_Normal) ImmOffset = CurByte; } else if (Expr->getKind() == MCExpr::SymbolRef) { - if (HasSecRelSymbolRef(Expr)) { + if (hasSecRelSymbolRef(Expr)) { FixupKind = MCFixupKind(FK_SecRel_4); } } else if (Expr->getKind() == MCExpr::Binary) { - const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr); - if (HasSecRelSymbolRef(Bin->getLHS()) - || HasSecRelSymbolRef(Bin->getRHS())) { + const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr *>(Expr); + if (hasSecRelSymbolRef(Bin->getLHS()) || + hasSecRelSymbolRef(Bin->getRHS())) { FixupKind = MCFixupKind(FK_SecRel_4); } } @@ -356,7 +348,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_: // leaq _GLOBAL_OFFSET_TABLE_(%rip), %r15 // this needs to be a GOTPC32 relocation. - if (StartsWithGlobalOffsetTable(Expr) != GOT_None) + if (startsWithGlobalOffsetTable(Expr) != GOT_None) FixupKind = MCFixupKind(X86::reloc_global_offset_table); } if (FixupKind == FK_PCRel_2) @@ -370,7 +362,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, // Emit a symbolic constant as a fixup and 4 zeros. Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); - EmitConstant(0, Size, CurByte, OS); + emitConstant(0, Size, CurByte, OS); } void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, @@ -379,19 +371,20 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); - const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg); - const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); - const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); + const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); + const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt); + const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; // Handle %rip relative addressing. 
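emitImmediate above biases pc-relative fixups, and the rIP-relative path below additionally passes in -ImmSize. A back-of-the-envelope sketch of why, assuming the usual MC behaviour that an FK_PCRel_* fixup resolves to Target minus the fixup's own address (names here are illustrative):

#include <cstdint>

// A pc-relative disp32 is resolved by the CPU against the address of the
// *next* instruction, i.e. FixupLoc + 4 (the field itself) + ImmSize (any
// trailing immediate). Since the fixup yields Target - FixupLoc, the
// encoder folds the remaining -4 and -ImmSize into the expression.
static int64_t ripDisp32(int64_t Target, int64_t FixupLoc, int ImmSize) {
  return Target - (FixupLoc + 4 + ImmSize);
}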
if (BaseReg == X86::RIP || - BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode - assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode"); + BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode + assert(STI.hasFeature(X86::Mode64Bit) && + "Rip-relative addressing requires 64-bit mode"); assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); - EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); unsigned Opcode = MI.getOpcode(); // movq loads are handled with a special relocation form which allows the @@ -432,20 +425,20 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, ? X86II::getSizeOfImm(TSFlags) : 0; - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), - CurByte, OS, Fixups, -ImmSize); + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + Fixups, -ImmSize); return; } - unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; + unsigned BaseRegNo = BaseReg ? getX86RegNum(Base) : -1U; // 16-bit addressing forms of the ModR/M byte have a different encoding for // the R/M field and are far more limited in which registers can be used. - if (Is16BitMemOperand(MI, Op, STI)) { + if (is16BitMemOperand(MI, Op, STI)) { if (BaseReg) { // For 32-bit addressing, the row and column values in Table 2-2 are // basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with - // some special cases. And GetX86RegNum reflects that numbering. + // some special cases. And getX86RegNum reflects that numbering. // For 16-bit addressing it's more fun, as shown in the SDM Vol 2A, // Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only // use SI/DI/BP/BX, which have "row" values 4-7 in no particular order, @@ -454,13 +447,13 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // // R16Table[] is a lookup from the normal RegNo, to the row values from // Table 2-1 for 16-bit addressing modes. Where zero means disallowed. - static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 }; + static const unsigned R16Table[] = {0, 0, 0, 7, 0, 6, 4, 5}; unsigned RMfield = R16Table[BaseRegNo]; assert(RMfield && "invalid 16-bit base register"); if (IndexReg.getReg()) { - unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)]; + unsigned IndexReg16 = R16Table[getX86RegNum(IndexReg)]; assert(IndexReg16 && "invalid 16-bit index register"); // We must have one of SI/DI (4,5), and one of BP/BX (6,7). @@ -479,23 +472,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (Disp.isImm() && isDisp8(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. - EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS); return; } // Use the [REG]+disp8 form, including for [BP] which cannot be encoded. - EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); return; } // This is the [REG]+disp16 case. - EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS); } else { // There is no BaseReg; this is the plain [disp16] case. 
- EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS); } // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases. - EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); + emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); return; } @@ -504,7 +497,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table // 2-7) and absolute references. - if (// The SIB byte must be used if there is an index register. + if ( // The SIB byte must be used if there is an index register. IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is @@ -512,11 +505,11 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseRegNo != N86::ESP && // If there is no base register and we're in 64-bit mode, we need a SIB // byte to emit an addr that is just 'disp32' (the non-RIP relative form). - (!is64BitMode(STI) || BaseReg != 0)) { + (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) { - if (BaseReg == 0) { // [disp32] in X86-32 mode - EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + if (BaseReg == 0) { // [disp32] in X86-32 mode + emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); + emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); return; } @@ -526,7 +519,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // by emitting a displacement of 0 below. if (BaseRegNo != N86::EBP) { if (Disp.isImm() && Disp.getImm() == 0) { - EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); return; } @@ -537,7 +530,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is exclusively used by call *a@tlscall(base). The relocation // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning. Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc())); - EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); return; } } @@ -546,70 +539,70 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. if (Disp.isImm()) { if (!HasEVEX && isDisp8(Disp.getImm())) { - EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); return; } // Try EVEX compressed 8-bit displacement first; if failed, fall back to // 32-bit displacement. 
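The branch structure above boils down to choosing the ModRM mod field for a [base+disp] operand that needs no SIB byte. A condensed sketch (it ignores symbolic displacements and the EVEX compressed-disp8 fallback mentioned at the end of the hunk):

#include <cstdint>

// mod=0: no displacement (not usable when the base encodes like EBP/R13,
//        because that pattern means "disp32 only" / rIP-relative instead);
// mod=1: a disp8 follows;  mod=2: a disp32 follows.
static unsigned chooseMod(int Disp, bool BaseEncodesAsEBP) {
  if (Disp == 0 && !BaseEncodesAsEBP)
    return 0;
  if (Disp == (int8_t)Disp)
    return 1;
  return 2;
}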
int CDisp8 = 0; if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { - EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, CDisp8 - Disp.getImm()); return; } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] - EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); unsigned Opcode = MI.getOpcode(); unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax : X86::reloc_signed_4byte; - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, Fixups); return; } // We need a SIB byte, so start by outputting the ModR/M byte first - assert(IndexReg.getReg() != X86::ESP && - IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP && + "Cannot use ESP as index reg!"); bool ForceDisp32 = false; - bool ForceDisp8 = false; + bool ForceDisp8 = false; int CDisp8 = 0; int ImmOffset = 0; if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. - EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); ForceDisp32 = true; } else if (!Disp.isImm()) { // Emit the normal disp32 encoding. - EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); ForceDisp32 = true; } else if (Disp.getImm() == 0 && // Base reg can't be anything that ends up with '5' as the base // reg, it is the magic [*] nomenclature that indicates no base. BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte - EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. - EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { // Emit the disp8 encoding. - EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. - EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); } // Calculate what the SS field value should be... - static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3}; unsigned SS = SSTable[Scale.getImm()]; if (BaseReg == 0) { @@ -617,30 +610,133 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Manual 2A, table 2-7. The displacement has already been output. 
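For the SIB path above, the SSTable lookup stores the scale as log2 of 1/2/4/8, and the SIB byte reuses the ModRM layout. A small sketch with illustrative names:

#include <cassert>
#include <cstdint>

// SIB = scale(2 bits, log2 of 1/2/4/8) | index(3 bits) | base(3 bits);
// index=4 means "no index register", and base=5 with mod=0 means
// "no base register, disp32 follows".
static uint8_t packSIB(unsigned Scale, unsigned Index, unsigned Base) {
  assert((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8) &&
         Index < 8 && Base < 8 && "fields out of range");
  unsigned SS = Scale == 8 ? 3 : Scale == 4 ? 2 : Scale == 2 ? 1 : 0;
  return uint8_t((SS << 6) | (Index << 3) | Base);
}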
unsigned IndexRegNo; if (IndexReg.getReg()) - IndexRegNo = GetX86RegNum(IndexReg); + IndexRegNo = getX86RegNum(IndexReg); else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) IndexRegNo = 4; - EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + emitSIBByte(SS, IndexRegNo, 5, CurByte, OS); } else { unsigned IndexRegNo; if (IndexReg.getReg()) - IndexRegNo = GetX86RegNum(IndexReg); + IndexRegNo = getX86RegNum(IndexReg); else - IndexRegNo = 4; // For example [ESP+1*<noreg>+4] - EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS); + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS); } // Do we need to output a displacement? if (ForceDisp8) - EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, Fixups); } -/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix +void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, + unsigned &CurByte, bool &Rex, + const MCInst &MI, const MCInstrDesc &Desc, + const MCSubtargetInfo &STI, + raw_ostream &OS) const { + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) + MemoryOperand += CurOp; + + // Emit segment override opcode prefix as needed. + if (MemoryOperand >= 0) + emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI, + OS); + + // Emit the repeat opcode prefix as needed. + unsigned Flags = MI.getFlags(); + if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT) + emitByte(0xF3, CurByte, OS); + if (Flags & X86::IP_HAS_REPEAT_NE) + emitByte(0xF2, CurByte, OS); + + // Emit the address size opcode prefix as needed. + bool need_address_override; + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) || + (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) || + (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) { + need_address_override = true; + } else if (MemoryOperand < 0) { + need_address_override = false; + } else if (STI.hasFeature(X86::Mode64Bit)) { + assert(!is16BitMemOperand(MI, MemoryOperand, STI)); + need_address_override = is32BitMemOperand(MI, MemoryOperand); + } else if (STI.hasFeature(X86::Mode32Bit)) { + assert(!is64BitMemOperand(MI, MemoryOperand)); + need_address_override = is16BitMemOperand(MI, MemoryOperand, STI); + } else { + assert(STI.hasFeature(X86::Mode16Bit)); + assert(!is64BitMemOperand(MI, MemoryOperand)); + need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI); + } + + if (need_address_override) + emitByte(0x67, CurByte, OS); + + // Encoding type for this instruction. 
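The new emitPrefixImpl above decides whether to emit the 0x67 address-size override. Stripped of the explicit AdSize16/AdSize32 TSFlags cases, the mode-versus-operand check reduces to roughly this (a sketch, not the LLVM API):

enum class Mode { Bit16, Bit32, Bit64 };

// The 0x67 prefix is needed whenever the memory operand's address size is
// not the current mode's default: a 32-bit address in 64-bit mode, a 16-bit
// address in 32-bit mode, or anything but a 16-bit address in 16-bit mode.
static bool needsAddrSizeOverride(Mode M, bool MemIs16Bit, bool MemIs32Bit) {
  switch (M) {
  case Mode::Bit64: return MemIs32Bit;
  case Mode::Bit32: return MemIs16Bit;
  case Mode::Bit16: return !MemIs16Bit;
  }
  return false;
}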
+ uint64_t Encoding = TSFlags & X86II::EncodingMask; + if (Encoding == 0) + Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); + else + emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + + uint64_t Form = TSFlags & X86II::FormMask; + switch (Form) { + default: + break; + case X86II::RawFrmDstSrc: { + unsigned siReg = MI.getOperand(1).getReg(); + assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || + (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || + (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && + "SI and DI register sizes do not match"); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(2).getReg() != X86::DS) + emitSegmentOverridePrefix(CurByte, 2, MI, OS); + // Emit AdSize prefix as needed. + if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || + (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) + emitByte(0x67, CurByte, OS); + CurOp += 3; // Consume operands. + break; + } + case X86II::RawFrmSrc: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit segment override opcode prefix as needed (not for %ds). + if (MI.getOperand(1).getReg() != X86::DS) + emitSegmentOverridePrefix(CurByte, 1, MI, OS); + // Emit AdSize prefix as needed. + if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || + (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) + emitByte(0x67, CurByte, OS); + CurOp += 2; // Consume operands. + break; + } + case X86II::RawFrmDst: { + unsigned siReg = MI.getOperand(0).getReg(); + // Emit AdSize prefix as needed. + if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) || + (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI)) + emitByte(0x67, CurByte, OS); + ++CurOp; // Consume operand. + break; + } + case X86II::RawFrmMemOffs: { + // Emit segment override opcode prefix as needed. + emitSegmentOverridePrefix(CurByte, 1, MI, OS); + break; + } + } +} + +/// emitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix /// called VEX. 
-void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, +void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { @@ -690,13 +786,26 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0b01010: XOP map select - 0Ah instructions with imm dword uint8_t VEX_5M; switch (TSFlags & X86II::OpMapMask) { - default: llvm_unreachable("Invalid prefix!"); - case X86II::TB: VEX_5M = 0x1; break; // 0F - case X86II::T8: VEX_5M = 0x2; break; // 0F 38 - case X86II::TA: VEX_5M = 0x3; break; // 0F 3A - case X86II::XOP8: VEX_5M = 0x8; break; - case X86II::XOP9: VEX_5M = 0x9; break; - case X86II::XOPA: VEX_5M = 0xA; break; + default: + llvm_unreachable("Invalid prefix!"); + case X86II::TB: + VEX_5M = 0x1; + break; // 0F + case X86II::T8: + VEX_5M = 0x2; + break; // 0F 38 + case X86II::TA: + VEX_5M = 0x3; + break; // 0F 3A + case X86II::XOP8: + VEX_5M = 0x8; + break; + case X86II::XOP9: + VEX_5M = 0x9; + break; + case X86II::XOPA: + VEX_5M = 0xA; + break; } // VEX_4V (VEX vvvv field): a register specifier @@ -724,9 +833,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // uint8_t VEX_PP = 0; switch (TSFlags & X86II::OpPrefixMask) { - case X86II::PD: VEX_PP = 0x1; break; // 66 - case X86II::XS: VEX_PP = 0x2; break; // F3 - case X86II::XD: VEX_PP = 0x3; break; // F2 + case X86II::PD: + VEX_PP = 0x1; + break; // 66 + case X86II::XS: + VEX_PP = 0x2; + break; // F3 + case X86II::XD: + VEX_PP = 0x3; + break; // F2 } // EVEX_U @@ -751,7 +866,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned CurOp = X86II::getOperandBias(Desc); switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!"); + default: + llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!"); case X86II::RawFrm: break; case X86II::MRMDestMem: { @@ -762,7 +878,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); VEX_B = ~(BaseRegEnc >> 3) & 1; - unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + unsigned IndexRegEnc = + getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. EVEX_V2 = ~(IndexRegEnc >> 4) & 1; @@ -807,7 +924,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); VEX_B = ~(BaseRegEnc >> 3) & 1; - unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + unsigned IndexRegEnc = + getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. 
EVEX_V2 = ~(IndexRegEnc >> 4) & 1; @@ -822,7 +940,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); VEX_B = ~(BaseRegEnc >> 3) & 1; - unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + unsigned IndexRegEnc = + getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf; @@ -838,14 +957,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); VEX_B = ~(BaseRegEnc >> 3) & 1; - unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + unsigned IndexRegEnc = + getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; break; } - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { + case X86II::MRM0m: + case X86II::MRM1m: + case X86II::MRM2m: + case X86II::MRM3m: + case X86II::MRM4m: + case X86II::MRM5m: + case X86II::MRM6m: + case X86II::MRM7m: { // MRM[0-9]m instructions forms: // MemAddr // src1(VEX_4V), MemAddr @@ -860,7 +984,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg); VEX_B = ~(BaseRegEnc >> 3) & 1; - unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); + unsigned IndexRegEnc = + getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. EVEX_V2 = ~(IndexRegEnc >> 4) & 1; @@ -894,7 +1019,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if (EVEX_b) { if (HasEVEX_RC) { - unsigned RcOperand = NumOps-1; + unsigned RcOperand = NumOps - 1; assert(RcOperand >= CurOp); EVEX_rc = MI.getOperand(RcOperand).getImm(); assert(EVEX_rc <= 3 && "Invalid rounding control!"); @@ -956,10 +1081,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EncodeRC = true; break; } - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: { + case X86II::MRM0r: + case X86II::MRM1r: + case X86II::MRM2r: + case X86II::MRM3r: + case X86II::MRM4r: + case X86II::MRM5r: + case X86II::MRM6r: + case X86II::MRM7r: { // MRM0r-MRM7r instructions forms: // dst(VEX_4V), src(ModR/M), imm8 if (HasVEX_4V) { @@ -996,17 +1125,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); // Can we use the 2 byte VEX prefix? - if (!(MI.getFlags() & X86::IP_USE_VEX3) && - Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { - EmitByte(0xC5, CurByte, OS); - EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX && + VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { + emitByte(0xC5, CurByte, OS); + emitByte(LastByte | (VEX_R << 7), CurByte, OS); return; } // 3 byte VEX prefix - EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS); - EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + emitByte(Encoding == X86II::XOP ? 
0x8F : 0xC4, CurByte, OS); + emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + emitByte(LastByte | (VEX_W << 7), CurByte, OS); } else { assert(Encoding == X86II::EVEX && "unknown encoding!"); // EVEX opcode prefix can have 4 bytes @@ -1014,39 +1143,30 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // +-----+ +--------------+ +-------------------+ +------------------------+ // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | // +-----+ +--------------+ +-------------------+ +------------------------+ - assert((VEX_5M & 0x3) == VEX_5M - && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); - - EmitByte(0x62, CurByte, OS); - EmitByte((VEX_R << 7) | - (VEX_X << 6) | - (VEX_B << 5) | - (EVEX_R2 << 4) | - VEX_5M, CurByte, OS); - EmitByte((VEX_W << 7) | - (VEX_4V << 3) | - (EVEX_U << 2) | - VEX_PP, CurByte, OS); + assert((VEX_5M & 0x3) == VEX_5M && + "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + + emitByte(0x62, CurByte, OS); + emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) | + VEX_5M, + CurByte, OS); + emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte, + OS); if (EncodeRC) - EmitByte((EVEX_z << 7) | - (EVEX_rc << 5) | - (EVEX_b << 4) | - (EVEX_V2 << 3) | - EVEX_aaa, CurByte, OS); + emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) | + EVEX_aaa, + CurByte, OS); else - EmitByte((EVEX_z << 7) | - (EVEX_L2 << 6) | - (VEX_L << 5) | - (EVEX_b << 4) | - (EVEX_V2 << 3) | - EVEX_aaa, CurByte, OS); + emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) | + (EVEX_V2 << 3) | EVEX_aaa, + CurByte, OS); } } -/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 -/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand -/// size, and 3) use of X86-64 extended registers. -uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, +/// Determine if the MCInst has to be encoded with a X86-64 REX prefix which +/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use +/// of X86-64 extended registers. +uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand, const MCInstrDesc &Desc) const { uint8_t REX = 0; @@ -1055,7 +1175,8 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, if (TSFlags & X86II::REX_W) REX |= 1 << 3; // set REX.W - if (MI.getNumOperands() == 0) return REX; + if (MI.getNumOperands() == 0) + return REX; unsigned NumOps = MI.getNumOperands(); unsigned CurOp = X86II::getOperandBias(Desc); @@ -1063,12 +1184,13 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. for (unsigned i = CurOp; i != NumOps; ++i) { const MCOperand &MO = MI.getOperand(i); - if (!MO.isReg()) continue; + if (!MO.isReg()) + continue; unsigned Reg = MO.getReg(); if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) UsesHighByteReg = true; if (X86II::isX86_64NonExtLowByteReg(Reg)) - // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // FIXME: The caller of determineREXPrefix slaps this prefix onto anything // that returns non-zero. 
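The byte math above is easier to read outside the diff. A sketch of the three-byte VEX form (0xC4), assuming the logical R/X/B extension bits and the vvvv register number are passed uninverted; the prefix stores them inverted, which is what the ~ operations in the hunk above account for:

#include <cstdint>

// Byte 0: 0xC4 (0x8F for XOP).  Byte 1: ~R ~X ~B | m-mmmm (map select).
// Byte 2: W | ~vvvv | L | pp.  The shorter two-byte 0xC5 form is only
// usable when X and B are not needed, W is 0 and the map is 0F.
static void packVEX3(uint8_t Out[3], bool R, bool X, bool B, uint8_t Map5,
                     bool W, uint8_t VVVV, bool L, uint8_t PP) {
  Out[0] = 0xC4;
  Out[1] = uint8_t(((R ? 0u : 1u) << 7) | ((X ? 0u : 1u) << 6) |
                   ((B ? 0u : 1u) << 5) | (Map5 & 0x1F));
  Out[2] = uint8_t(((W ? 1u : 0u) << 7) | ((~VVVV & 0xF) << 3) |
                   ((L ? 1u : 0u) << 2) | (PP & 0x3));
}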
REX |= 0x40; // REX fixed encoding prefix } @@ -1084,9 +1206,9 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, break; case X86II::MRMSrcMem: case X86II::MRMSrcMemCC: - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; break; case X86II::MRMDestReg: @@ -1094,57 +1216,82 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; case X86II::MRMDestMem: - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X CurOp += X86::AddrNumOperands; REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R break; - case X86II::MRMXmCC: case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X + case X86II::MRMXmCC: + case X86II::MRMXm: + case X86II::MRM0m: + case X86II::MRM1m: + case X86II::MRM2m: + case X86II::MRM3m: + case X86II::MRM4m: + case X86II::MRM5m: + case X86II::MRM6m: + case X86II::MRM7m: + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X break; - case X86II::MRMXrCC: case X86II::MRMXr: - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: + case X86II::MRMXrCC: + case X86II::MRMXr: + case X86II::MRM0r: + case X86II::MRM1r: + case X86II::MRM2r: + case X86II::MRM3r: + case X86II::MRM4r: + case X86II::MRM5r: + case X86II::MRM6r: + case X86II::MRM7r: REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B break; } if (REX && UsesHighByteReg) - report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + report_fatal_error( + "Cannot encode high byte register in REX-prefixed instruction"); return REX; } -/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed -void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte, +/// Emit segment override opcode prefix as needed. +void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, const MCInst &MI, raw_ostream &OS) const { // Check for explicit segment override on memory operand. 
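determineREXPrefix above accumulates the four REX bits operand-form by operand-form; the resulting byte looks like this (a sketch that omits the bare-0x40 case forced by SPL/BPL/SIL/DIL, which the FIXME above refers to):

#include <cstdint>

// REX = 0100 W R X B: W selects 64-bit operand size, R extends ModRM.reg,
// X extends SIB.index, B extends ModRM.r/m (or SIB.base). A zero result
// means "no REX byte needed".
static uint8_t packREX(bool W, bool R, bool X, bool B) {
  uint8_t Bits = uint8_t((W << 3) | (R << 2) | (X << 1) | (B ? 1 : 0));
  return Bits ? uint8_t(0x40 | Bits) : uint8_t(0);
}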
switch (MI.getOperand(SegOperand).getReg()) { - default: llvm_unreachable("Unknown segment register!"); - case 0: break; - case X86::CS: EmitByte(0x2E, CurByte, OS); break; - case X86::SS: EmitByte(0x36, CurByte, OS); break; - case X86::DS: EmitByte(0x3E, CurByte, OS); break; - case X86::ES: EmitByte(0x26, CurByte, OS); break; - case X86::FS: EmitByte(0x64, CurByte, OS); break; - case X86::GS: EmitByte(0x65, CurByte, OS); break; + default: + llvm_unreachable("Unknown segment register!"); + case 0: + break; + case X86::CS: + emitByte(0x2E, CurByte, OS); + break; + case X86::SS: + emitByte(0x36, CurByte, OS); + break; + case X86::DS: + emitByte(0x3E, CurByte, OS); + break; + case X86::ES: + emitByte(0x26, CurByte, OS); + break; + case X86::FS: + emitByte(0x64, CurByte, OS); + break; + case X86::GS: + emitByte(0x65, CurByte, OS); + break; } } /// Emit all instruction prefixes prior to the opcode. /// -/// MemOperand is the operand # of the start of a memory operand if present. If -/// Not present, it is -1. +/// \param MemOperand the operand # of the start of a memory operand if present. +/// If not present, it is -1. /// -/// Returns true if a REX prefix was used. +/// \returns true if a REX prefix was used. bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, @@ -1152,35 +1299,35 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS) const { bool Ret = false; // Emit the operand size opcode prefix as needed. - if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32 - : X86II::OpSize16)) - EmitByte(0x66, CurByte, OS); + if ((TSFlags & X86II::OpSizeMask) == + (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16)) + emitByte(0x66, CurByte, OS); // Emit the LOCK opcode prefix. if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK) - EmitByte(0xF0, CurByte, OS); + emitByte(0xF0, CurByte, OS); // Emit the NOTRACK opcode prefix. if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK) - EmitByte(0x3E, CurByte, OS); + emitByte(0x3E, CurByte, OS); switch (TSFlags & X86II::OpPrefixMask) { - case X86II::PD: // 66 - EmitByte(0x66, CurByte, OS); + case X86II::PD: // 66 + emitByte(0x66, CurByte, OS); break; - case X86II::XS: // F3 - EmitByte(0xF3, CurByte, OS); + case X86II::XS: // F3 + emitByte(0xF3, CurByte, OS); break; - case X86II::XD: // F2 - EmitByte(0xF2, CurByte, OS); + case X86II::XD: // F2 + emitByte(0xF2, CurByte, OS); break; } // Handle REX prefix. // FIXME: Can this come before F2 etc to simplify emission? - if (is64BitMode(STI)) { - if (uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc)) { - EmitByte(0x40 | REX, CurByte, OS); + if (STI.hasFeature(X86::Mode64Bit)) { + if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) { + emitByte(0x40 | REX, CurByte, OS); Ret = true; } } else { @@ -1189,33 +1336,50 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 0x0F escape code must be emitted just before the opcode. switch (TSFlags & X86II::OpMapMask) { - case X86II::TB: // Two-byte opcode map - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller. - EmitByte(0x0F, CurByte, OS); + case X86II::TB: // Two-byte opcode map + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller. 
+ emitByte(0x0F, CurByte, OS); break; } switch (TSFlags & X86II::OpMapMask) { - case X86II::T8: // 0F 38 - EmitByte(0x38, CurByte, OS); + case X86II::T8: // 0F 38 + emitByte(0x38, CurByte, OS); break; - case X86II::TA: // 0F 3A - EmitByte(0x3A, CurByte, OS); + case X86II::TA: // 0F 3A + emitByte(0x3A, CurByte, OS); break; } return Ret; } -void X86MCCodeEmitter:: -encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS, + const MCSubtargetInfo &STI) const { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + + // Pseudo instructions don't get encoded. + if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + return; + + unsigned CurOp = X86II::getOperandBias(Desc); + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + bool Rex = false; + emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); +} + +void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MCII.get(Opcode); uint64_t TSFlags = Desc.TSFlags; - unsigned Flags = MI.getFlags(); // Pseudo instructions don't get encoded. if ((TSFlags & X86II::FormMask) == X86II::Pseudo) @@ -1227,8 +1391,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, // Keep track of the current byte being emitted. unsigned CurByte = 0; - // Encoding type for this instruction. - uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool Rex = false; + emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); // It uses the VEX.VVVV field? bool HasVEX_4V = TSFlags & X86II::VEX_4V; @@ -1241,104 +1405,25 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, // Used if a register is encoded in 7:4 of immediate. unsigned I8RegNum = 0; - // Determine where the memory operand starts, if present. - int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); - if (MemoryOperand != -1) MemoryOperand += CurOp; - - // Emit segment override opcode prefix as needed. - if (MemoryOperand >= 0) - EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg, - MI, OS); - - // Emit the repeat opcode prefix as needed. - if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT) - EmitByte(0xF3, CurByte, OS); - if (Flags & X86::IP_HAS_REPEAT_NE) - EmitByte(0xF2, CurByte, OS); - - // Emit the address size opcode prefix as needed. 
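emitOpcodePrefix above ends with the legacy opcode-map escapes; condensed into one helper (a sketch with an illustrative enum, not LLVM's OpMapMask values):

#include <string>

enum class OpMap { OneByte, TwoByte /*0F*/, ThreeByte38 /*0F 38*/,
                   ThreeByte3A /*0F 3A*/ };

// Escape bytes emitted immediately before the opcode: none for the one-byte
// map, 0F for the two-byte map, 0F 38 / 0F 3A for the three-byte maps.
static void appendMapEscape(OpMap Map, std::string &Out) {
  if (Map != OpMap::OneByte)
    Out.push_back('\x0F');
  if (Map == OpMap::ThreeByte38)
    Out.push_back('\x38');
  else if (Map == OpMap::ThreeByte3A)
    Out.push_back('\x3A');
}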
- bool need_address_override; - uint64_t AdSize = TSFlags & X86II::AdSizeMask; - if ((is16BitMode(STI) && AdSize == X86II::AdSize32) || - (is32BitMode(STI) && AdSize == X86II::AdSize16) || - (is64BitMode(STI) && AdSize == X86II::AdSize32)) { - need_address_override = true; - } else if (MemoryOperand < 0) { - need_address_override = false; - } else if (is64BitMode(STI)) { - assert(!Is16BitMemOperand(MI, MemoryOperand, STI)); - need_address_override = Is32BitMemOperand(MI, MemoryOperand); - } else if (is32BitMode(STI)) { - assert(!Is64BitMemOperand(MI, MemoryOperand)); - need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI); - } else { - assert(is16BitMode(STI)); - assert(!Is64BitMemOperand(MI, MemoryOperand)); - need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI); - } - - if (need_address_override) - EmitByte(0x67, CurByte, OS); - - bool Rex = false; - if (Encoding == 0) - Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); - else - EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); - uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) - BaseOpcode = 0x0F; // Weird 3DNow! encoding. + BaseOpcode = 0x0F; // Weird 3DNow! encoding. unsigned OpcodeOffset = 0; uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { - default: errs() << "FORM: " << Form << "\n"; + default: + errs() << "FORM: " << Form << "\n"; llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); case X86II::Pseudo: llvm_unreachable("Pseudo instruction shouldn't be emitted"); - case X86II::RawFrmDstSrc: { - unsigned siReg = MI.getOperand(1).getReg(); - assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) || - (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) || - (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) && - "SI and DI register sizes do not match"); - // Emit segment override opcode prefix as needed (not for %ds). - if (MI.getOperand(2).getReg() != X86::DS) - EmitSegmentOverridePrefix(CurByte, 2, MI, OS); - // Emit AdSize prefix as needed. - if ((!is32BitMode(STI) && siReg == X86::ESI) || - (is32BitMode(STI) && siReg == X86::SI)) - EmitByte(0x67, CurByte, OS); - CurOp += 3; // Consume operands. - EmitByte(BaseOpcode, CurByte, OS); - break; - } - case X86II::RawFrmSrc: { - unsigned siReg = MI.getOperand(0).getReg(); - // Emit segment override opcode prefix as needed (not for %ds). - if (MI.getOperand(1).getReg() != X86::DS) - EmitSegmentOverridePrefix(CurByte, 1, MI, OS); - // Emit AdSize prefix as needed. - if ((!is32BitMode(STI) && siReg == X86::ESI) || - (is32BitMode(STI) && siReg == X86::SI)) - EmitByte(0x67, CurByte, OS); - CurOp += 2; // Consume operands. - EmitByte(BaseOpcode, CurByte, OS); + case X86II::RawFrmDstSrc: + case X86II::RawFrmSrc: + case X86II::RawFrmDst: + emitByte(BaseOpcode, CurByte, OS); break; - } - case X86II::RawFrmDst: { - unsigned siReg = MI.getOperand(0).getReg(); - // Emit AdSize prefix as needed. - if ((!is32BitMode(STI) && siReg == X86::EDI) || - (is32BitMode(STI) && siReg == X86::DI)) - EmitByte(0x67, CurByte, OS); - ++CurOp; // Consume operand. - EmitByte(BaseOpcode, CurByte, OS); - break; - } case X86II::AddCCFrm: { // This will be added to the opcode in the fallthrough. OpcodeOffset = MI.getOperand(NumOps - 1).getImm(); @@ -1346,49 +1431,47 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, --NumOps; // Drop the operand from the end. 
LLVM_FALLTHROUGH; case X86II::RawFrm: - EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS); + emitByte(BaseOpcode + OpcodeOffset, CurByte, OS); - if (!is64BitMode(STI) || !isPCRel32Branch(MI)) + if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII)) break; const MCOperand &Op = MI.getOperand(CurOp++); - EmitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags), + emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags), MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS, Fixups); break; } case X86II::RawFrmMemOffs: - // Emit segment override opcode prefix as needed. - EmitSegmentOverridePrefix(CurByte, 1, MI, OS); - EmitByte(BaseOpcode, CurByte, OS); - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + emitByte(BaseOpcode, CurByte, OS); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); ++CurOp; // skip segment operand break; case X86II::RawFrmImm8: - EmitByte(BaseOpcode, CurByte, OS); - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + emitByte(BaseOpcode, CurByte, OS); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); break; case X86II::RawFrmImm16: - EmitByte(BaseOpcode, CurByte, OS); - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + emitByte(BaseOpcode, CurByte, OS); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); break; case X86II::AddRegFrm: - EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); break; case X86II::MRMDestReg: { - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1397,13 +1480,13 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; - EmitRegModRMByte(MI.getOperand(CurOp), - GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + emitRegModRMByte(MI.getOperand(CurOp), + getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); CurOp = SrcRegNum + 1; break; } case X86II::MRMDestMem: { - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); unsigned SrcRegNum = CurOp + X86::AddrNumOperands; if (HasEVEX_K) // Skip writemask @@ -1412,13 +1495,13 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; - emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags, + emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp = SrcRegNum + 1; break; } case X86II::MRMSrcReg: { - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1427,8 +1510,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; - EmitRegModRMByte(MI.getOperand(SrcRegNum), - GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + 
emitRegModRMByte(MI.getOperand(SrcRegNum), + getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); CurOp = SrcRegNum + 1; if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); @@ -1438,17 +1521,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRMSrcReg4VOp3: { - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); unsigned SrcRegNum = CurOp + 1; - EmitRegModRMByte(MI.getOperand(SrcRegNum), - GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + emitRegModRMByte(MI.getOperand(SrcRegNum), + getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); CurOp = SrcRegNum + 1; ++CurOp; // Encoded in VEX.VVVV break; } case X86II::MRMSrcRegOp4: { - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); unsigned SrcRegNum = CurOp + 1; // Skip 1st src (which is encoded in VEX_VVVV) @@ -1458,8 +1541,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); I8RegNum = getX86RegEncoding(MI, SrcRegNum++); - EmitRegModRMByte(MI.getOperand(SrcRegNum), - GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + emitRegModRMByte(MI.getOperand(SrcRegNum), + getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); CurOp = SrcRegNum + 1; break; } @@ -1468,24 +1551,24 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned SecondOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - EmitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, CurByte, OS); - EmitRegModRMByte(MI.getOperand(SecondOp), - GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); + emitRegModRMByte(MI.getOperand(SecondOp), + getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); break; } case X86II::MRMSrcMem: { - unsigned FirstMemOp = CurOp+1; + unsigned FirstMemOp = CurOp + 1; if (HasEVEX_K) // Skip writemask ++FirstMemOp; if (HasVEX_4V) - ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); - emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; if (HasVEX_I8Reg) @@ -1493,28 +1576,28 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRMSrcMem4VOp3: { - unsigned FirstMemOp = CurOp+1; + unsigned FirstMemOp = CurOp + 1; - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); - emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; ++CurOp; // Encoded in VEX.VVVV. break; } case X86II::MRMSrcMemOp4: { - unsigned FirstMemOp = CurOp+1; + unsigned FirstMemOp = CurOp + 1; - ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). 
// Capture second register source (encoded in Imm[7:4]) assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); I8RegNum = getX86RegEncoding(MI, FirstMemOp++); - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); - emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; break; @@ -1525,9 +1608,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - EmitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, CurByte, OS); - emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)), + emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)), TSFlags, Rex, CurByte, OS, Fixups, STI); break; } @@ -1536,24 +1619,28 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned RegOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - EmitByte(BaseOpcode + CC, CurByte, OS); - EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); + emitByte(BaseOpcode + CC, CurByte, OS); + emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); break; } case X86II::MRMXr: - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: + case X86II::MRM0r: + case X86II::MRM1r: + case X86II::MRM2r: + case X86II::MRM3r: + case X86II::MRM4r: + case X86II::MRM5r: + case X86II::MRM6r: + case X86II::MRM7r: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - EmitByte(BaseOpcode, CurByte, OS); - EmitRegModRMByte(MI.getOperand(CurOp++), - (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r, - CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); + emitRegModRMByte(MI.getOperand(CurOp++), + (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte, + OS); break; case X86II::MRMXmCC: { @@ -1561,52 +1648,98 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - EmitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, CurByte, OS); emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI); break; } case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRM0m: + case X86II::MRM1m: + case X86II::MRM2m: + case X86II::MRM3m: + case X86II::MRM4m: + case X86II::MRM5m: + case X86II::MRM6m: + case X86II::MRM7m: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - EmitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); emitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 
0 : Form - X86II::MRM0m, TSFlags, Rex, CurByte, OS, Fixups, STI); CurOp += X86::AddrNumOperands; break; - case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: - case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: - case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: - case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: - case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: - case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: - case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: - case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: - case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: - case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: - case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: - case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: - case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: - case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: - case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_C0: + case X86II::MRM_C1: + case X86II::MRM_C2: + case X86II::MRM_C3: + case X86II::MRM_C4: + case X86II::MRM_C5: + case X86II::MRM_C6: + case X86II::MRM_C7: + case X86II::MRM_C8: + case X86II::MRM_C9: + case X86II::MRM_CA: + case X86II::MRM_CB: + case X86II::MRM_CC: + case X86II::MRM_CD: + case X86II::MRM_CE: + case X86II::MRM_CF: + case X86II::MRM_D0: + case X86II::MRM_D1: + case X86II::MRM_D2: + case X86II::MRM_D3: + case X86II::MRM_D4: + case X86II::MRM_D5: + case X86II::MRM_D6: + case X86II::MRM_D7: + case X86II::MRM_D8: + case X86II::MRM_D9: + case X86II::MRM_DA: + case X86II::MRM_DB: + case X86II::MRM_DC: + case X86II::MRM_DD: + case X86II::MRM_DE: + case X86II::MRM_DF: + case X86II::MRM_E0: + case X86II::MRM_E1: + case X86II::MRM_E2: + case X86II::MRM_E3: + case X86II::MRM_E4: + case X86II::MRM_E5: + case X86II::MRM_E6: + case X86II::MRM_E7: + case X86II::MRM_E8: + case X86II::MRM_E9: + case X86II::MRM_EA: + case X86II::MRM_EB: + case X86II::MRM_EC: + case X86II::MRM_ED: + case X86II::MRM_EE: + case X86II::MRM_EF: + case X86II::MRM_F0: + case X86II::MRM_F1: + case X86II::MRM_F2: + case X86II::MRM_F3: + case X86II::MRM_F4: + case X86II::MRM_F5: + case X86II::MRM_F6: + case X86II::MRM_F7: + case X86II::MRM_F8: + case X86II::MRM_F9: + case X86II::MRM_FA: + case X86II::MRM_FB: + case X86II::MRM_FC: + case X86II::MRM_FD: + case X86II::MRM_FE: case X86II::MRM_FF: - EmitByte(BaseOpcode, CurByte, OS); - EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); + emitByte(BaseOpcode, CurByte, OS); + emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); break; } @@ -1620,21 +1753,21 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, assert(Val < 16 && "Immediate operand value out of range"); I8RegNum |= Val; } - EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1, + emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); } else { // If there is a remaining operand, it must be a trailing immediate. Emit it // according to the right size for the instruction. Some instructions // (SSE4a extrq and insertq) have two trailing immediates. 
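The imm8[7:4] register handling just above (I8RegNum) packs a fourth operand into the top nibble of a trailing immediate. A one-function sketch, with illustrative names:

#include <cassert>
#include <cstdint>

// Used by "is4" encodings such as vblendvps: the extra source register's
// encoding occupies imm8[7:4], and any literal value (0..15) occupies the
// low nibble of the same trailing immediate.
static uint8_t packRegInImm8(unsigned RegEnc, unsigned ImmLow = 0) {
  assert(RegEnc < 16 && ImmLow < 16 && "nibble fields out of range");
  return uint8_t((RegEnc << 4) | ImmLow);
}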
while (CurOp != NumOps && NumOps - CurOp <= 2) { - EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), CurByte, OS, Fixups); } } if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) - EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); #ifndef NDEBUG // FIXME: Verify. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ced9eacc8b97..049a3a815984 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -290,12 +290,9 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { std::string ArchFS = X86_MC::ParseX86Triple(TT); - if (!FS.empty()) { - if (!ArchFS.empty()) - ArchFS = (Twine(ArchFS) + "," + FS).str(); - else - ArchFS = FS; - } + assert(!ArchFS.empty() && "Failed to parse X86 triple"); + if (!FS.empty()) + ArchFS = (Twine(ArchFS) + "," + FS).str(); std::string CPUName = CPU; if (CPUName.empty()) @@ -323,7 +320,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) { } static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, - const Triple &TheTriple) { + const Triple &TheTriple, + const MCTargetOptions &Options) { bool is64Bit = TheTriple.getArch() == Triple::x86_64; MCAsmInfo *MAI; @@ -554,7 +552,7 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { } // Force static initialization. -extern "C" void LLVMInitializeX86TargetMC() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMC() { for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) { // Register the MC asm info. 
RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); diff --git a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 47c41626a666..18cda8f591c3 100644 --- a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -19,7 +19,7 @@ Target &llvm::getTheX86_64Target() { return TheX86_64Target; } -extern "C" void LLVMInitializeX86TargetInfo() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetInfo() { RegisterTarget<Triple::x86, /*HasJIT=*/true> X( getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above", "X86"); diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 6840fc12751d..0481a40d462a 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -150,6 +150,18 @@ void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); + +namespace X86AS { +enum : unsigned { + GS = 256, + FS = 257, + SS = 258, + PTR32_SPTR = 270, + PTR32_UPTR = 271, + PTR64 = 272 +}; +} // End X86AS namespace + } // End llvm namespace #endif diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index d8631aca2734..a2b11d55f650 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -304,12 +304,12 @@ def FeatureFastVariableShuffle : SubtargetFeature<"fast-variable-shuffle", "HasFastVariableShuffle", "true", "Shuffles with variable masks are fast">; -// On some X86 processors, there is no performance hazard to writing only the -// lower parts of a YMM or ZMM register without clearing the upper part. -def FeatureFastPartialYMMorZMMWrite - : SubtargetFeature<"fast-partial-ymm-or-zmm-write", - "HasFastPartialYMMorZMMWrite", - "true", "Partial writes to YMM/ZMM registers are fast">; +// On some X86 processors, a vzeroupper instruction should be inserted after +// using ymm/zmm registers before executing code that may use SSE instructions. +def FeatureInsertVZEROUPPER + : SubtargetFeature<"vzeroupper", + "InsertVZEROUPPER", + "true", "Should insert vzeroupper instructions">; // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if // vector FSQRT has higher throughput than the corresponding NR code. @@ -386,6 +386,10 @@ def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", "Prefer 256-bit AVX instructions">; +def FeaturePreferMaskRegisters + : SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true", + "Prefer AVX512 mask registers over PTEST/MOVMSK">; + // Lower indirect calls using a special construct called a `retpoline` to // mitigate potential Spectre v2 attacks against them. def FeatureRetpolineIndirectCalls @@ -439,7 +443,7 @@ def FeatureFastHorizontalOps : SubtargetFeature< "fast-hops", "HasFastHorizontalOps", "true", "Prefer horizontal vector math instructions (haddp, phsub, etc.) 
over " - "normal vector instructions with shuffles", [FeatureSSE3]>; + "normal vector instructions with shuffles">; def FeatureFastScalarShiftMasks : SubtargetFeature< @@ -451,6 +455,10 @@ def FeatureFastVectorShiftMasks "fast-vector-shift-masks", "HasFastVectorShiftMasks", "true", "Prefer a left/right vector logical shift pair over a shift+and pair">; +def FeatureUseGLMDivSqrtCosts + : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true", + "Use Goldmont specific floating point div/sqrt costs">; + // Merge branches using three-way conditional code. def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "ThreewayBranchProfitable", "true", @@ -465,12 +473,6 @@ def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; // Silvermont def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">; -// Goldmont -def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">; -// Goldmont Plus -def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">; -// Tremont -def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">; //===----------------------------------------------------------------------===// // Register File Description @@ -499,6 +501,7 @@ include "X86SchedHaswell.td" include "X86SchedBroadwell.td" include "X86ScheduleSLM.td" include "X86ScheduleZnver1.td" +include "X86ScheduleZnver2.td" include "X86ScheduleBdVer2.td" include "X86ScheduleBtVer2.td" include "X86SchedSkylakeClient.td" @@ -521,7 +524,8 @@ def ProcessorFeatures { FeatureCMPXCHG16B, FeaturePOPCNT, FeatureLAHFSAHF, - FeatureMacroFusion]; + FeatureMacroFusion, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> NHMSpecificFeatures = []; list<SubtargetFeature> NHMFeatures = !listconcat(NHMInheritableFeatures, NHMSpecificFeatures); @@ -701,7 +705,8 @@ def ProcessorFeatures { FeatureCMPXCHG16B, FeatureMOVBE, FeatureSlowTwoMemOps, - FeatureLAHFSAHF]; + FeatureLAHFSAHF, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom, FeatureSlowUAMem16, FeatureLEAForSP, @@ -739,7 +744,7 @@ def ProcessorFeatures { FeatureXSAVES, FeatureCLFLUSHOPT, FeatureFSGSBase]; - list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM, + list<SubtargetFeature> GLMSpecificFeatures = [FeatureUseGLMDivSqrtCosts, FeaturePOPCNTFalseDeps]; list<SubtargetFeature> GLMInheritableFeatures = !listconcat(SLMInheritableFeatures, GLMAdditionalFeatures); @@ -750,7 +755,7 @@ def ProcessorFeatures { list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE, FeatureRDPID, FeatureSGX]; - list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP]; + list<SubtargetFeature> GLPSpecificFeatures = [FeatureUseGLMDivSqrtCosts]; list<SubtargetFeature> GLPInheritableFeatures = !listconcat(GLMInheritableFeatures, GLPAdditionalFeatures); list<SubtargetFeature> GLPFeatures = @@ -762,7 +767,7 @@ def ProcessorFeatures { FeatureMOVDIRI, FeatureMOVDIR64B, FeatureWAITPKG]; - list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM]; + list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts]; list<SubtargetFeature> TRMFeatures = !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures, TRMSpecificFeatures); @@ -801,8 +806,8 @@ def ProcessorFeatures { FeatureBMI2, FeatureFMA, FeaturePRFCHW, + FeaturePreferMaskRegisters, FeatureSlowTwoMemOps, - FeatureFastPartialYMMorZMMWrite, FeatureHasFastGather, FeatureSlowPMADDWD]; // TODO Add AVX5124FMAPS/AVX5124VNNIW 
features @@ -823,7 +828,8 @@ def ProcessorFeatures { FeatureLAHFSAHF, FeatureCMOV, Feature64Bit, - FeatureFastScalarShiftMasks]; + FeatureFastScalarShiftMasks, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures; // Bobcat @@ -845,7 +851,9 @@ def ProcessorFeatures { FeatureFast15ByteNOP, FeatureFastScalarShiftMasks, FeatureFastVectorShiftMasks]; - list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures; + list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER]; + list<SubtargetFeature> BtVer1Features = + !listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures); // Jaguar list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX, @@ -858,7 +866,6 @@ def ProcessorFeatures { FeatureXSAVEOPT]; list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT, FeatureFastBEXTR, - FeatureFastPartialYMMorZMMWrite, FeatureFastHorizontalOps]; list<SubtargetFeature> BtVer2InheritableFeatures = !listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures); @@ -886,7 +893,8 @@ def ProcessorFeatures { FeatureLAHFSAHF, FeatureFast11ByteNOP, FeatureFastScalarShiftMasks, - FeatureBranchFusion]; + FeatureBranchFusion, + FeatureInsertVZEROUPPER]; list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures; // PileDriver @@ -949,6 +957,7 @@ def ProcessorFeatures { FeatureSHA, FeatureSSE4A, FeatureSlowSHLD, + FeatureInsertVZEROUPPER, FeatureX87, FeatureXSAVE, FeatureXSAVEC, @@ -971,28 +980,32 @@ class Proc<string Name, list<SubtargetFeature> Features> // NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled // if i386/i486 is specifically requested. def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; -def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; +def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; +def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16, + FeatureInsertVZEROUPPER]>; def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B]>; + FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, - FeatureCMPXCHG8B, FeatureMMX]>; + FeatureCMPXCHG8B, FeatureMMX, + FeatureInsertVZEROUPPER]>; def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureCMOV, FeatureNOPL]>; + FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>; def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureCMOV, FeatureFXSR, - FeatureNOPL]>; + FeatureNOPL, FeatureInsertVZEROUPPER]>; foreach P = ["pentium3", "pentium3m"] in { def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>; + FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV, + FeatureInsertVZEROUPPER]>; } // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. 
@@ -1008,29 +1021,29 @@ foreach P = ["pentium3", "pentium3m"] in { def : ProcessorModel<"pentium-m", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; foreach P = ["pentium4", "pentium4m"] in { def : ProcessorModel<P, GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; } // Intel Quark. -def : Proc<"lakemont", []>; +def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; // NetBurst. def : ProcessorModel<"prescott", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureSlowUAMem16, @@ -1041,7 +1054,8 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureFXSR, FeatureNOPL, Feature64Bit, - FeatureCMPXCHG16B + FeatureCMPXCHG16B, + FeatureInsertVZEROUPPER ]>; // Intel Core 2 Solo/Duo. @@ -1057,7 +1071,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ Feature64Bit, FeatureCMPXCHG16B, FeatureLAHFSAHF, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureX87, @@ -1071,7 +1086,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ Feature64Bit, FeatureCMPXCHG16B, FeatureLAHFSAHF, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; // Atom CPUs. @@ -1138,35 +1154,36 @@ def : ProcessorModel<"tigerlake", SkylakeServerModel, // AMD CPUs. 
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - FeatureMMX]>; + FeatureMMX, FeatureInsertVZEROUPPER]>; def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow]>; + Feature3DNow, FeatureInsertVZEROUPPER]>; def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNow]>; + Feature3DNow, FeatureInsertVZEROUPPER]>; foreach P = ["athlon", "athlon-tbird"] in { def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, - Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>; + Feature3DNowA, FeatureNOPL, FeatureSlowSHLD, + FeatureInsertVZEROUPPER]>; } foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV, FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL, - FeatureSlowSHLD]>; + FeatureSlowSHLD, FeatureInsertVZEROUPPER]>; } foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD, FeatureCMOV, - FeatureFastScalarShiftMasks]>; + FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3, Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD, FeatureCMOV, Feature64Bit, - FeatureFastScalarShiftMasks]>; + FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { @@ -1188,17 +1205,20 @@ def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>; def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>; def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>; -def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>; +def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>; def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, - Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; + Feature3DNowA, FeatureInsertVZEROUPPER]>; + +def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureInsertVZEROUPPER]>; +def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, + FeatureInsertVZEROUPPER]>; +def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow, + FeatureInsertVZEROUPPER]>; def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureMMX, FeatureSSE1, FeatureFXSR, - FeatureCMOV]>; + FeatureCMOV, FeatureInsertVZEROUPPER]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -1221,7 +1241,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [ Feature64Bit, FeatureSlow3OpsLEA, FeatureSlowIncDec, - FeatureMacroFusion + FeatureMacroFusion, + FeatureInsertVZEROUPPER ]>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 8d27be30a277..39d16e7999cd 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -218,9 +218,16 @@ void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo, O << MO.getImm(); return; + case 
MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_GlobalAddress: { - if (IsATT) + switch (MI->getInlineAsmDialect()) { + case InlineAsm::AD_ATT: O << '$'; + break; + case InlineAsm::AD_Intel: + O << "offset "; + break; + } PrintSymbolOperand(MO, O); break; } @@ -336,14 +343,22 @@ void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo, PrintLeaMemReference(MI, OpNo, O, Modifier); } + void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, - unsigned OpNo, raw_ostream &O) { + unsigned OpNo, raw_ostream &O, + const char *Modifier) { const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg); unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm(); const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg); const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp); const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg); + // If we really don't want to print out (rip), don't. + bool HasBaseReg = BaseReg.getReg() != 0; + if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && + BaseReg.getReg() == X86::RIP) + HasBaseReg = false; + // If this has a segment register, print it. if (SegReg.getReg()) { PrintOperand(MI, OpNo + X86::AddrSegmentReg, O); @@ -353,7 +368,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, O << '['; bool NeedPlus = false; - if (BaseReg.getReg()) { + if (HasBaseReg) { PrintOperand(MI, OpNo + X86::AddrBaseReg, O); NeedPlus = true; } @@ -371,7 +386,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, PrintOperand(MI, OpNo + X86::AddrDisp, O); } else { int64_t DispVal = DispSpec.getImm(); - if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (DispVal || (!IndexReg.getReg() && !HasBaseReg)) { if (NeedPlus) { if (DispVal > 0) O << " + "; @@ -524,11 +539,6 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) { - if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { - PrintIntelMemReference(MI, OpNo, O); - return false; - } - if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. @@ -542,14 +552,26 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, // These only apply to registers, ignore on mem. break; case 'H': - PrintMemReference(MI, OpNo, O, "H"); + if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { + return true; // Unsupported modifier in Intel inline assembly. + } else { + PrintMemReference(MI, OpNo, O, "H"); + } return false; case 'P': // Don't print @PLT, but do print as memory. - PrintMemReference(MI, OpNo, O, "no-rip"); + if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { + PrintIntelMemReference(MI, OpNo, O, "no-rip"); + } else { + PrintMemReference(MI, OpNo, O, "no-rip"); + } return false; } } - PrintMemReference(MI, OpNo, O, nullptr); + if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) { + PrintIntelMemReference(MI, OpNo, O, nullptr); + } else { + PrintMemReference(MI, OpNo, O, nullptr); + } return false; } @@ -614,7 +636,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { Feat00Flags |= 1; } - if (M.getModuleFlag("cfguardtable")) + if (M.getModuleFlag("cfguard")) Feat00Flags |= 0x800; // Object is CFG-aware. 
OutStreamer->EmitSymbolAttribute(S, MCSA_Global); @@ -727,7 +749,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { //===----------------------------------------------------------------------===// // Force static initialization. -extern "C" void LLVMInitializeX86AsmPrinter() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmPrinter() { RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target()); RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target()); } diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index a011310970b3..ee79401dc80d 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -26,7 +26,7 @@ class MCStreamer; class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { - const X86Subtarget *Subtarget; + const X86Subtarget *Subtarget = nullptr; StackMaps SM; FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; @@ -60,7 +60,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // to emit any necessary padding-NOPs. void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); private: - const MachineFunction *MF; + const MachineFunction *MF = nullptr; bool InShadow = false; // RequiredShadowSize holds the length of the shadow specified in the most @@ -112,7 +112,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, const char *Modifier); void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O); + raw_ostream &O, const char *Modifier); public: X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 69c6b3356cbb..0f1d4b51062e 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -46,6 +46,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; @@ -83,13 +84,13 @@ public: } private: - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; - const X86RegisterInfo *TRI; + MachineRegisterInfo *MRI = nullptr; + const X86InstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2> BlockedLoadsStoresPairs; SmallVector<MachineInstr *, 2> ForRemoval; - AliasAnalysis *AA; + AliasAnalysis *AA = nullptr; /// Returns couples of Load then Store to memory which look /// like a memcpy. 
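For illustration, the kind of source that exercises the Intel-dialect "offset" printing added in the X86AsmPrinter hunk above looks roughly like the sketch below. It is hypothetical (the symbol and function names are invented, not from this commit) and assumes a GNU-style extended-asm template written in Intel operand order (e.g. built with -masm=intel) in a non-PIC build.

// Hypothetical sketch: an "i"-constrained symbol operand in extended inline
// asm. With the Intel inline-asm dialect, the printer change above emits an
// instruction like "mov rax, offset my_global"; the AT&T path prints
// "$my_global" instead.
static int my_global;

unsigned long global_address() {
  unsigned long addr;
  __asm__("mov %0, %1" : "=r"(addr) : "i"(&my_global)); // Intel order: dst, src
  return addr;
}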
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index ad7e32b4efc8..f8faa572dffc 100644 --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -115,12 +115,12 @@ private: StringRef getPassName() const override { return "X86 Optimize Call Frame"; } - const X86InstrInfo *TII; - const X86FrameLowering *TFL; - const X86Subtarget *STI; - MachineRegisterInfo *MRI; - unsigned SlotSize; - unsigned Log2SlotSize; + const X86InstrInfo *TII = nullptr; + const X86FrameLowering *TFL = nullptr; + const X86Subtarget *STI = nullptr; + MachineRegisterInfo *MRI = nullptr; + unsigned SlotSize = 0; + unsigned Log2SlotSize = 0; }; } // end anonymous namespace diff --git a/llvm/lib/Target/X86/X86CallLowering.cpp b/llvm/lib/Target/X86/X86CallLowering.cpp index 7ee637cfd523..57bf799cf89c 100644 --- a/llvm/lib/Target/X86/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/X86CallLowering.cpp @@ -115,7 +115,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MIRBuilder.buildConstant(OffsetReg, Offset); Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); + MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); return AddrReg; diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 4c49d68bec99..db1aef2fd09d 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -346,6 +346,10 @@ def RetCC_X86_Win64_C : CallingConv<[ // The X86-Win64 calling convention always returns __m64 values in RAX. CCIfType<[x86mmx], CCBitConvertToType<i64>>, + // GCC returns FP values in RAX on Win64. + CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>, + CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>, + // Otherwise, everything is the same as 'normal' X86-64 C CC. CCDelegateTo<RetCC_X86_64_C> ]>; @@ -434,6 +438,7 @@ def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>, + // CFGuard_Check never returns a value so does not need a RetCC. // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, @@ -606,10 +611,12 @@ def CC_X86_Win64_C : CallingConv<[ // A SwiftError is passed in R12. CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + // The 'CFGuardTarget' parameter, if any, is passed in RAX. + CCIfCFGuardTarget<CCAssignToReg<[RAX]>>, + // 128 bit vectors are passed by pointer CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, - // 256 bit vectors are passed by pointer CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, @@ -622,6 +629,16 @@ def CC_X86_Win64_C : CallingConv<[ // The first 4 MMX vector arguments are passed in GPRs. CCIfType<[x86mmx], CCBitConvertToType<i64>>, + // If SSE was disabled, pass FP values smaller than 64-bits as integers in + // GPRs or on the stack. + CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>, + CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. 
+ CCIfType<[f32, f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + // The first 4 integer arguments are passed in integer registers. CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ], [XMM0, XMM1, XMM2, XMM3]>>, @@ -639,11 +656,6 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], [XMM0, XMM1, XMM2, XMM3]>>, - // The first 4 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], - [RCX , RDX , R8 , R9 ]>>, - // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>> @@ -936,6 +948,12 @@ def CC_X86_32_FastCC : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_Win32_CFGuard_Check : CallingConv<[ + // The CFGuard check call takes exactly one integer argument + // (i.e. the target function address), which is passed in ECX. + CCIfType<[i32], CCAssignToReg<[ECX]>> +]>; + def CC_X86_32_GHC : CallingConv<[ // Promote i8/i16 arguments to i32. CCIfType<[i8, i16], CCPromoteToType<i32>>, @@ -1000,6 +1018,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::CFGuard_Check", CCDelegateTo<CC_X86_Win32_CFGuard_Check>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, @@ -1136,7 +1155,9 @@ def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; // Register calling convention preserves few GPR and XMM8-15 def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>; def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, - (sequence "XMM%u", 4, 7))>; + (sequence "XMM%u", 4, 7))>; +def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>; +def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>; def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, (sequence "R%u", 10, 15))>; def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE, diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index 5123853f5455..fe43bf4cbbce 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -61,6 +61,7 @@ #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -111,9 +112,9 @@ public: static char ID; private: - MachineRegisterInfo *MRI; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; TargetSchedModel TSchedModel; /// List of consecutive CMOV instructions. 
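For illustration, the CFGuard_Check convention introduced in the X86CallingConv.td hunk above (a single integer argument, the prospective indirect-call target, passed in ECX on 32-bit Windows) corresponds to the documented Control Flow Guard check. The sketch below is only a conceptual approximation: __fastcall is used to model "first argument in ECX", the caller is hypothetical, and it makes no claim about how the compiler actually materializes the check.

// Conceptual sketch of a CFG-protected indirect call on 32-bit Windows.
// __guard_check_icall_fptr is the process-wide check pointer used by Control
// Flow Guard; the real compiler-inserted call uses the custom CFGuard_Check
// convention from the .td changes above, which __fastcall (first integer
// argument in ECX) only approximates here.
extern "C" void(__fastcall *__guard_check_icall_fptr)(void *Target);

void call_checked(void (*Callee)()) {
  // The check terminates the process if Callee is not a valid call target.
  __guard_check_icall_fptr(reinterpret_cast<void *>(Callee));
  Callee();
}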
diff --git a/llvm/lib/Target/X86/X86CondBrFolding.cpp b/llvm/lib/Target/X86/X86CondBrFolding.cpp index 1bf2d5ba7b8f..7ede94664bf6 100644 --- a/llvm/lib/Target/X86/X86CondBrFolding.cpp +++ b/llvm/lib/Target/X86/X86CondBrFolding.cpp @@ -115,8 +115,6 @@ private: void optimizeCondBr(MachineBasicBlock &MBB, SmallVectorImpl<MachineBasicBlock *> &BranchPath); - void fixBranchProb(MachineBasicBlock *NextMBB, MachineBasicBlock *RootMBB, - SmallVectorImpl<MachineBasicBlock *> &BranchPath); void replaceBrDest(MachineBasicBlock *MBB, MachineBasicBlock *OrigDest, MachineBasicBlock *NewDest); void fixupModifiedCond(MachineBasicBlock *MBB); diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index b4cf5cafbc6e..438b9fd8eebb 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -373,9 +373,9 @@ public: }; class X86DomainReassignment : public MachineFunctionPass { - const X86Subtarget *STI; - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; + const X86Subtarget *STI = nullptr; + MachineRegisterInfo *MRI = nullptr; + const X86InstrInfo *TII = nullptr; /// All edges that are included in some closure DenseSet<unsigned> EnclosedEdges; diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp index 24c8e6d6f6eb..f1cf9b94c9e5 100755 --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -84,7 +84,7 @@ public: private: /// Machine instruction info used throughout the class. - const X86InstrInfo *TII; + const X86InstrInfo *TII = nullptr; }; } // end anonymous namespace diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 9126a1fbea52..d35d65914b34 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -41,11 +41,11 @@ public: MachineFunctionPass::getAnalysisUsage(AU); } - const X86Subtarget *STI; - const X86InstrInfo *TII; - const X86RegisterInfo *TRI; - const X86MachineFunctionInfo *X86FI; - const X86FrameLowering *X86FL; + const X86Subtarget *STI = nullptr; + const X86InstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; + const X86MachineFunctionInfo *X86FI = nullptr; + const X86FrameLowering *X86FL = nullptr; bool runOnMachineFunction(MachineFunction &Fn) override; diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index e5e089d07d55..1dbf40683564 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" @@ -3218,6 +3219,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CallingConv::X86_ThisCall: case CallingConv::Win64: case CallingConv::X86_64_SysV: + case CallingConv::CFGuard_Check: break; } diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index 9f7c4afde760..f8c4a2adb851 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -48,11 +48,14 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/LivePhysRegs.h" #include 
"llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" @@ -113,6 +116,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to // guide some heuristics. + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -127,19 +132,22 @@ public: } private: - MachineFunction *MF; + MachineFunction *MF = nullptr; /// Machine instruction info used throughout the class. - const X86InstrInfo *TII; + const X86InstrInfo *TII = nullptr; /// Local member for function's OptForSize attribute. - bool OptForSize; + bool OptForSize = false; /// Machine loop info used for guiding some heruistics. - MachineLoopInfo *MLI; + MachineLoopInfo *MLI = nullptr; /// Register Liveness information after the current instruction. LivePhysRegs LiveRegs; + + ProfileSummaryInfo *PSI; + MachineBlockFrequencyInfo *MBFI; }; char FixupBWInstPass::ID = 0; } @@ -154,8 +162,11 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { this->MF = &MF; TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - OptForSize = MF.getFunction().hasOptSize(); MLI = &getAnalysis<MachineLoopInfo>(); + PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; LiveRegs.init(TII->getRegisterInfo()); LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";); @@ -426,6 +437,9 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); + OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); + for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp index 543dc8b00fa0..9ac401bb0253 100644 --- a/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -113,8 +113,8 @@ public: private: TargetSchedModel TSM; - const X86InstrInfo *TII; - const X86RegisterInfo *TRI; + const X86InstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; }; } @@ -650,6 +650,9 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, .addReg(DestReg) .add(Index); LLVM_DEBUG(NewMI->dump();); + + MBB.erase(I); + I = NewMI; return; } diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp index cbde280aa280..924f429fc138 100644 --- a/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -43,19 +43,8 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; private: - // Find the preceding instruction that imp-defs eflags. - MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB, - MachineBasicBlock::reverse_iterator MI); - - // Return true if MI imp-uses eflags. - bool impUsesFlags(MachineInstr *MI); - - // Return true if this is the opcode of a SetCC instruction with a register - // output. 
- bool isSetCCr(unsigned Opode); - - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; + MachineRegisterInfo *MRI = nullptr; + const X86InstrInfo *TII = nullptr; enum { SearchBound = 16 }; @@ -67,31 +56,6 @@ char X86FixupSetCCPass::ID = 0; FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); } -// We expect the instruction *immediately* before the setcc to imp-def -// EFLAGS (because of scheduling glue). To make this less brittle w.r.t -// scheduling, look backwards until we hit the beginning of the -// basic-block, or a small bound (to avoid quadratic behavior). -MachineInstr * -X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB, - MachineBasicBlock::reverse_iterator MI) { - // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator? - auto MBBStart = MBB->rend(); - for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI) - for (auto &Op : MI->implicit_operands()) - if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef()) - return &*MI; - - return nullptr; -} - -bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) { - for (auto &Op : MI->implicit_operands()) - if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse()) - return true; - - return false; -} - bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; MRI = &MF.getRegInfo(); @@ -100,7 +64,12 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineInstr*, 4> ToErase; for (auto &MBB : MF) { + MachineInstr *FlagsDefMI = nullptr; for (auto &MI : MBB) { + // Remember the most recent preceding eflags defining instruction. + if (MI.definesRegister(X86::EFLAGS)) + FlagsDefMI = &MI; + // Find a setcc that is used by a zext. // This doesn't have to be the only use, the transformation is safe // regardless. @@ -115,9 +84,6 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { if (!ZExt) continue; - // Find the preceding instruction that imp-defs eflags. - MachineInstr *FlagsDefMI = findFlagsImpDef( - MI.getParent(), MachineBasicBlock::reverse_iterator(&MI)); if (!FlagsDefMI) continue; @@ -126,7 +92,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { // it, itself, by definition, clobbers eflags. But it may happen that // FlagsDefMI also *uses* eflags, in which case the transformation is // invalid. 
- if (impUsesFlags(FlagsDefMI)) + if (FlagsDefMI->readsRegister(X86::EFLAGS)) continue; ++NumSubstZexts; diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index cfba06fb6533..b1d2de29c896 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -87,12 +87,12 @@ public: static char ID; private: - MachineRegisterInfo *MRI; - const X86Subtarget *Subtarget; - const X86InstrInfo *TII; - const TargetRegisterInfo *TRI; - const TargetRegisterClass *PromoteRC; - MachineDominatorTree *MDT; + MachineRegisterInfo *MRI = nullptr; + const X86Subtarget *Subtarget = nullptr; + const X86InstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const TargetRegisterClass *PromoteRC = nullptr; + MachineDominatorTree *MDT = nullptr; CondRegArray collectCondsInRegs(MachineBasicBlock &MBB, MachineBasicBlock::iterator CopyDefI); @@ -115,6 +115,10 @@ private: MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &CMovI, MachineOperand &FlagUse, CondRegArray &CondRegs); + void rewriteFCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, + MachineInstr &CMovI, MachineOperand &FlagUse, + CondRegArray &CondRegs); void rewriteCondJmp(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs); @@ -334,6 +338,28 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB, return NewMBB; } +static X86::CondCode getCondFromFCMOV(unsigned Opcode) { + switch (Opcode) { + default: return X86::COND_INVALID; + case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80: + return X86::COND_BE; + case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80: + return X86::COND_B; + case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80: + return X86::COND_E; + case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80: + return X86::COND_A; + case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80: + return X86::COND_AE; + case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80: + return X86::COND_NE; + case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80: + return X86::COND_NP; + case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80: + return X86::COND_P; + } +} + bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() << " **********\n"); @@ -593,6 +619,8 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { // Otherwise we can just rewrite in-place. if (X86::getCondFromCMov(MI) != X86::COND_INVALID) { rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); + } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) { + rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) { rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs); } else if (MI.getOpcode() == TargetOpcode::COPY) { @@ -674,6 +702,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { } Blocks.push_back(SuccMBB); + + // After this, EFLAGS will be recreated before each use. + SuccMBB->removeLiveIn(X86::EFLAGS); } } while (!Blocks.empty()); @@ -779,10 +810,10 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic( CondRegArray &CondRegs) { // Arithmetic is either reading CF or OF. 
Figure out which condition we need // to preserve in a register. - X86::CondCode Cond; + X86::CondCode Cond = X86::COND_INVALID; // The addend to use to reset CF or OF when added to the flag value. - int Addend; + int Addend = 0; switch (getMnemonicFromOpcode(MI.getOpcode())) { case FlagArithMnemonic::ADC: @@ -852,6 +883,51 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB, LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump()); } +void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB, + MachineBasicBlock::iterator TestPos, + DebugLoc TestLoc, + MachineInstr &CMovI, + MachineOperand &FlagUse, + CondRegArray &CondRegs) { + // First get the register containing this specific condition. + X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode()); + unsigned CondReg; + bool Inverted; + std::tie(CondReg, Inverted) = + getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs); + + MachineBasicBlock &MBB = *CMovI.getParent(); + + // Insert a direct test of the saved register. + insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg); + + auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode!"); + case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32: + case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32: + case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32: + case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32: + return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32; + case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64: + case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64: + case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64: + case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64: + return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64; + case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80: + case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80: + case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80: + case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80: + return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80; + } + }; + + // Rewrite the CMov to use the !ZF flag from the test. + CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted))); + FlagUse.setIsKill(true); + LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump()); +} + void X86FlagsCopyLoweringPass::rewriteCondJmp( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) { diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index fcfb5bc91314..13bbd6ccfce4 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -83,7 +84,7 @@ namespace { StringRef getPassName() const override { return "X86 FP Stackifier"; } private: - const TargetInstrInfo *TII; // Machine instruction info. + const TargetInstrInfo *TII = nullptr; // Machine instruction info. // Two CFG edges are related if they leave the same block, or enter the same // block. The transitive closure of an edge under this relation is a @@ -119,7 +120,7 @@ namespace { SmallVector<LiveBundle, 8> LiveBundles; // The edge bundle analysis provides indices into the LiveBundles vector. - EdgeBundles *Bundles; + EdgeBundles *Bundles = nullptr; // Return a bitmask of FP registers in block's live-in list. 
static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) { @@ -143,14 +144,14 @@ namespace { // Partition all the CFG edges into LiveBundles. void bundleCFGRecomputeKillFlags(MachineFunction &MF); - MachineBasicBlock *MBB; // Current basic block + MachineBasicBlock *MBB = nullptr; // Current basic block // The hardware keeps track of how many FP registers are live, so we have // to model that exactly. Usually, each live register corresponds to an // FP<n> register, but when dealing with calls, returns, and inline // assembly, it is sometimes necessary to have live scratch registers. unsigned Stack[8]; // FP<n> Registers in each stack slot... - unsigned StackTop; // The current top of the FP stack. + unsigned StackTop = 0; // The current top of the FP stack. enum { NumFPRegs = 8 // Including scratch pseudo-registers. @@ -666,9 +667,12 @@ static const TableEntry OpcodeTable[] = { { X86::CMOVP_Fp32 , X86::CMOVP_F }, { X86::CMOVP_Fp64 , X86::CMOVP_F }, { X86::CMOVP_Fp80 , X86::CMOVP_F }, - { X86::COS_Fp32 , X86::COS_F }, - { X86::COS_Fp64 , X86::COS_F }, - { X86::COS_Fp80 , X86::COS_F }, + { X86::COM_FpIr32 , X86::COM_FIr }, + { X86::COM_FpIr64 , X86::COM_FIr }, + { X86::COM_FpIr80 , X86::COM_FIr }, + { X86::COM_Fpr32 , X86::COM_FST0r }, + { X86::COM_Fpr64 , X86::COM_FST0r }, + { X86::COM_Fpr80 , X86::COM_FST0r }, { X86::DIVR_Fp32m , X86::DIVR_F32m }, { X86::DIVR_Fp64m , X86::DIVR_F64m }, { X86::DIVR_Fp64m32 , X86::DIVR_F32m }, @@ -741,9 +745,6 @@ static const TableEntry OpcodeTable[] = { { X86::MUL_FpI32m32 , X86::MUL_FI32m }, { X86::MUL_FpI32m64 , X86::MUL_FI32m }, { X86::MUL_FpI32m80 , X86::MUL_FI32m }, - { X86::SIN_Fp32 , X86::SIN_F }, - { X86::SIN_Fp64 , X86::SIN_F }, - { X86::SIN_Fp80 , X86::SIN_F }, { X86::SQRT_Fp32 , X86::SQRT_F }, { X86::SQRT_Fp64 , X86::SQRT_F }, { X86::SQRT_Fp80 , X86::SQRT_F }, @@ -803,6 +804,10 @@ static unsigned getConcreteOpcode(unsigned Opcode) { static const TableEntry PopTable[] = { { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + { X86::COMP_FST0r, X86::FCOMPP }, + { X86::COM_FIr , X86::COM_FIPr }, + { X86::COM_FST0r , X86::COMP_FST0r }, + { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, { X86::DIV_FrST0 , X86::DIV_FPrST0 }, @@ -841,7 +846,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { int Opcode = Lookup(PopTable, I->getOpcode()); if (Opcode != -1) { I->setDesc(TII->get(Opcode)); - if (Opcode == X86::UCOM_FPPr) + if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr) I->RemoveOperand(0); } else { // Insert an explicit pop I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0); @@ -971,22 +976,23 @@ void FPS::shuffleStackTop(const unsigned char *FixStack, //===----------------------------------------------------------------------===// void FPS::handleCall(MachineBasicBlock::iterator &I) { + MachineInstr &MI = *I; unsigned STReturns = 0; - const MachineFunction* MF = I->getParent()->getParent(); - for (const auto &MO : I->operands()) { - if (!MO.isReg()) + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI.getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; - unsigned R = MO.getReg() - X86::FP0; + assert(Op.isImplicit() && "Expected implicit def/use"); - if (R < 8) { - if (MF->getFunction().getCallingConv() != CallingConv::X86_RegCall) { - assert(MO.isDef() && MO.isImplicit()); - } + if (Op.isDef()) + STReturns |= 1 << getFPReg(Op); - STReturns |= 1 << R; - } + // Remove the operand so that later passes don't see it. 
+ MI.RemoveOperand(i); + --i; + --e; } unsigned N = countTrailingOnes(STReturns); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 1b469a814adc..799c1f5d1285 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -92,7 +92,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI.hasCopyImplyingStackAdjustment()); } -static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { +static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::SUB64ri8; @@ -104,7 +104,7 @@ static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { } } -static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { +static unsigned getADDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::ADD64ri8; @@ -116,12 +116,12 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } -static unsigned getSUBrrOpcode(unsigned isLP64) { - return isLP64 ? X86::SUB64rr : X86::SUB32rr; +static unsigned getSUBrrOpcode(bool IsLP64) { + return IsLP64 ? X86::SUB64rr : X86::SUB32rr; } -static unsigned getADDrrOpcode(unsigned isLP64) { - return isLP64 ? X86::ADD64rr : X86::ADD32rr; +static unsigned getADDrrOpcode(bool IsLP64) { + return IsLP64 ? X86::ADD64rr : X86::ADD32rr; } static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { @@ -135,7 +135,7 @@ static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { return X86::AND32ri; } -static unsigned getLEArOpcode(unsigned IsLP64) { +static unsigned getLEArOpcode(bool IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } @@ -993,8 +993,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, bool NeedsWinFPO = !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag(); bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; - bool NeedsDwarfCFI = - !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry()); + bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves(); Register FramePtr = TRI->getFrameRegister(MF); const Register MachineFramePtr = STI.isTarget64BitILP32() @@ -1262,7 +1261,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. - int Alloc = isEAXAlive ? NumBytes - 8 : NumBytes; + int64_t Alloc = isEAXAlive ? 
NumBytes - 8 : NumBytes; if (isUInt<32>(Alloc)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(Alloc) @@ -1614,10 +1613,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, bool HasFP = hasFP(MF); uint64_t NumBytes = 0; - bool NeedsDwarfCFI = - (!MF.getTarget().getTargetTriple().isOSDarwin() && - !MF.getTarget().getTargetTriple().isOSWindows()) && - (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry()); + bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() && + !MF.getTarget().getTargetTriple().isOSWindows()) && + MF.needsFrameMoves(); if (IsFunclet) { assert(HasFP && "EH funclets without FP not yet implemented"); @@ -1862,7 +1860,7 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, return getFrameIndexReference(MF, FI, FrameReg); FrameReg = TRI->getStackRegister(); - return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; + return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; } int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, @@ -2812,11 +2810,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = alignTo(Amount, StackAlign); - MachineModuleInfo &MMI = MF.getMMI(); const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool DwarfCFI = !WindowsCFI && - (MMI.hasDebugInfo() || F.needsUnwindTableEntry()); + bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves(); // If we have any exception handlers in this function, and we adjust // the SP before calls, we may need to indicate this to the unwinder diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 5b546d42d98a..bf33f399db28 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -335,7 +336,7 @@ namespace { // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. - if (!OptForSize) + if (!CurDAG->shouldOptForSize()) return false; // Walk all the users of the immediate. @@ -536,12 +537,17 @@ namespace { // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); - if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || - Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) { + if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM || + Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE || + Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. EVT OpVT = N->getOperand(0).getValueType(); + // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the + // second operand. 
+ if (Opcode == X86ISD::STRICT_CMPM) + OpVT = N->getOperand(1).getValueType(); if (OpVT.is256BitVector() || OpVT.is128BitVector()) return Subtarget->hasVLX(); @@ -575,6 +581,12 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (!N.hasOneUse()) return false; + // FIXME: Temporary hack to prevent strict floating point nodes from + // folding into masked operations illegally. + if (U == Root && Root->getOpcode() == ISD::VSELECT && + N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD) + return false; + if (N.getOpcode() != ISD::LOAD) return true; @@ -804,8 +816,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } switch (N->getOpcode()) { + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: { + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: { // Replace vector fp_to_s/uint with their X86 specific equivalent so we // don't need 2 sets of patterns. if (!N->getSimpleValueType(0).isVector()) @@ -814,13 +830,24 @@ void X86DAGToDAGISel::PreprocessISelDAG() { unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); - case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; - case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; + case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; + case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; + case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; + case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; + case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; + case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; } - SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), - N->getOperand(0)); + SDValue Res; + if (N->isStrictFPOpcode()) + Res = + CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); + else + Res = + CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), + N->getOperand(0)); --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; CurDAG->DeleteNode(N); continue; @@ -869,27 +896,45 @@ void X86DAGToDAGISel::PreprocessISelDAG() { continue; } case ISD::FCEIL: + case ISD::STRICT_FCEIL: case ISD::FFLOOR: + case ISD::STRICT_FFLOOR: case ISD::FTRUNC: + case ISD::STRICT_FTRUNC: case ISD::FNEARBYINT: - case ISD::FRINT: { + case ISD::STRICT_FNEARBYINT: + case ISD::FRINT: + case ISD::STRICT_FRINT: { // Replace fp rounding with their X86 specific equivalent so we don't // need 2 sets of patterns. 
unsigned Imm; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); + case ISD::STRICT_FCEIL: case ISD::FCEIL: Imm = 0xA; break; + case ISD::STRICT_FFLOOR: case ISD::FFLOOR: Imm = 0x9; break; + case ISD::STRICT_FTRUNC: case ISD::FTRUNC: Imm = 0xB; break; + case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: Imm = 0xC; break; + case ISD::STRICT_FRINT: case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); - SDValue Res = CurDAG->getNode( - X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), - CurDAG->getTargetConstant(Imm, dl, MVT::i8)); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Res; + if (IsStrict) + Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, + {N->getValueType(0), MVT::Other}, + {N->getOperand(0), N->getOperand(1), + CurDAG->getTargetConstant(Imm, dl, MVT::i8)}); + else + Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), + N->getOperand(0), + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; CurDAG->DeleteNode(N); continue; @@ -1017,12 +1062,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. - MVT MemVT; - if (N->getOpcode() == ISD::FP_ROUND) - MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. - else - MemVT = SrcIsSSE ? SrcVT : DstVT; - + MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); SDLoc dl(N); @@ -1075,22 +1115,47 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. - MVT MemVT; - if (N->getOpcode() == ISD::STRICT_FP_ROUND) - MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. - else - MemVT = SrcIsSSE ? SrcVT : DstVT; - + MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? //Since the operation is StrictFP, use the preexisting chain. 
- SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1), - MemTmp, MachinePointerInfo(), MemVT); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, - MachinePointerInfo(), MemVT); + SDValue Store, Result; + if (!SrcIsSSE) { + SDVTList VTs = CurDAG->getVTList(MVT::Other); + SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; + Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, + MachinePointerInfo(), 0, + MachineMemOperand::MOStore); + if (N->getFlags().hasNoFPExcept()) { + SDNodeFlags Flags = Store->getFlags(); + Flags.setNoFPExcept(true); + Store->setFlags(Flags); + } + } else { + assert(SrcVT == MemVT && "Unexpected VT!"); + Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, + MachinePointerInfo()); + } + + if (!DstIsSSE) { + SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); + SDValue Ops[] = {Store, MemTmp}; + Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT, + MachinePointerInfo(), 0, + MachineMemOperand::MOLoad); + if (N->getFlags().hasNoFPExcept()) { + SDNodeFlags Flags = Result->getFlags(); + Flags.setNoFPExcept(true); + Result->setFlags(Flags); + } + } else { + assert(DstVT == MemVT && "Unexpected VT!"); + Result = + CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo()); + } // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because @@ -2224,12 +2289,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue(); unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); - // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. - if (AddrSpace == 256) + if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); - if (AddrSpace == 257) + if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); - if (AddrSpace == 258) + if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); SDLoc DL(N); @@ -3019,7 +3083,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { LLVM_FALLTHROUGH; case X86ISD::ADD: // Try to match inc/dec. - if (!Subtarget->slowIncDec() || OptForSize) { + if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. 
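The selectVectorAddr change above swaps the magic address-space numbers for the named X86AS constants. A minimal sketch of the idea, assuming the enumerators keep their historical values (256 for GS, 257 for FS, 258 for SS); segmentPrefixFor is an illustrative helper, not part of this patch:

namespace X86AS {
enum : unsigned { GS = 256, FS = 257, SS = 258 };
} // namespace X86AS

// Illustrative only: choose the segment-override prefix for a pointer's
// address space; the default (flat) address space needs no prefix.
static const char *segmentPrefixFor(unsigned AddrSpace) {
  switch (AddrSpace) {
  case X86AS::GS: return "gs";
  case X86AS::FS: return "fs";
  case X86AS::SS: return "ss";
  default:        return "";
  }
}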
@@ -4410,6 +4474,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, CNode); return; } + + break; } } @@ -5094,6 +5160,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) { MachineSDNode *NewNode; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) { + if (!LoadN->isSimple()) { + unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits(); + if (MOpc == X86::TEST8mi && NumVolBits != 8) + break; + else if (MOpc == X86::TEST16mi && NumVolBits != 16) + break; + else if (MOpc == X86::TEST32mi && NumVolBits != 32) + break; + } + } SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Reg.getOperand(0) }; NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); @@ -5190,34 +5267,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (foldLoadStoreIntoMemOperand(Node)) return; break; - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FNEARBYINT: - case ISD::FRINT: { - // Replace fp rounding with their X86 specific equivalent so we don't - // need 2 sets of patterns. - // FIXME: This can only happen when the nodes started as STRICT_* and have - // been mutated into their non-STRICT equivalents. Eventually this - // mutation will be removed and we should switch the STRICT_ nodes to a - // strict version of RNDSCALE in PreProcessISelDAG. - unsigned Imm; - switch (Node->getOpcode()) { - default: llvm_unreachable("Unexpected opcode!"); - case ISD::FCEIL: Imm = 0xA; break; - case ISD::FFLOOR: Imm = 0x9; break; - case ISD::FTRUNC: Imm = 0xB; break; - case ISD::FNEARBYINT: Imm = 0xC; break; - case ISD::FRINT: Imm = 0x4; break; - } - SDLoc dl(Node); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0), - Node->getOperand(0), - CurDAG->getTargetConstant(Imm, dl, MVT::i8)); - ReplaceNode(Node, Res.getNode()); - SelectCode(Res.getNode()); - return; - } } SelectCode(Node); @@ -5230,10 +5279,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, switch (ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); - case InlineAsm::Constraint_i: - // FIXME: It seems strange that 'i' is needed here since it's supposed to - // be an immediate and not a memory constraint. - LLVM_FALLTHROUGH; case InlineAsm::Constraint_o: // offsetable ?? case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ed975e9248a8..0f152968ddfd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25,7 +25,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -154,17 +156,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } - if (Subtarget.isTargetDarwin()) { - // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. - setUseUnderscoreSetJmp(false); - setUseUnderscoreLongJmp(false); - } else if (Subtarget.isTargetWindowsGNU()) { - // MS runtime is weird: it exports _setjmp, but longjmp! 
- setUseUnderscoreSetJmp(true); - setUseUnderscoreLongJmp(false); - } else { - setUseUnderscoreSetJmp(true); - setUseUnderscoreLongJmp(true); + if (Subtarget.getTargetTriple().isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); } // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to @@ -217,72 +212,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ShiftOp , MVT::i64 , Custom); } - // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this - // operation. - setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (!Subtarget.useSoftFloat()) { - // We have an algorithm for SSE2->double, and we turn this into a - // 64-bit FILD followed by conditional FADD for other targets. - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); - } else { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand); - } - - // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have - // this operation. - setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); - setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); - - if (!Subtarget.useSoftFloat()) { - // SSE has no i16 to fp conversion, only i32. - if (X86ScalarSSEf32) { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); - } else { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); - } - } else { - setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); - setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand); - } - - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have - // this operation. - setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); - setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - - if (!Subtarget.useSoftFloat()) { + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); + + // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. + setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); + // SSE has no i16 to fp conversion, only i32. 
We promote in the handler + // to allow f80 to use i16 and f64 to use i16 with sse1 only + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); + // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); - } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); - } - - // Handle FP_TO_UINT by promoting the destination to a larger signed - // conversion. - setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - - if (!Subtarget.useSoftFloat()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - } + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + + // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. + setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + // FIXME: This doesn't generate invalid exception when it should. PR44019. + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + } + + // Handle address space casts between mixed sized pointers. + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { @@ -409,12 +401,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); - // These should be promoted to a larger select which is supported. 
- setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) @@ -619,6 +611,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Handle constrained floating-point operations of scalar. + setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); @@ -659,6 +665,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Expand); setOperationAction(ISD::LLRINT, MVT::f80, Expand); + + // Handle constrained floating-point operations of scalar. + setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten + // as Custom. + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. 
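The hunk that follows switches most f128 arithmetic from Custom/Expand to LibCall and registers the STRICT_ variants next to the existing entries, matching the scalar f32/f64/f80 changes above. As a hedged reminder of what these action tables express, here is a simplified sketch of how a target-lowering constructor pairs a constrained opcode with its non-strict twin; the types and actions are chosen for illustration only, not a restatement of the real X86 tables:

// A strict op that maps onto the same instruction as its non-strict twin is
// marked Legal alongside it, while an op with no native instruction becomes
// a library call for both forms.
for (MVT VT : {MVT::f32, MVT::f64}) {
  setOperationAction(ISD::FADD,        VT, Legal);
  setOperationAction(ISD::STRICT_FADD, VT, Legal);   // same instruction, plus a chain
}
setOperationAction(ISD::FSIN,        MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);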
@@ -668,22 +685,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); + setOperationAction(ISD::FMA, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::FSIN, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); + setOperationAction(ISD::FCOS, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); + setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); + // No STRICT_FSINCOS + setOperationAction(ISD::FSQRT, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); + + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. 
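All of the STRICT_ opcodes registered throughout this patch produce a chain result in addition to their value, which is why the DAG-building code elsewhere in the diff uses {VT, MVT::Other} value lists and passes the incoming chain as the first operand. A minimal sketch, assuming DAG, DL, LHS and RHS are already in scope:

// Result 0 is the value, result 1 is the outgoing chain.
SDValue Chain = DAG.getEntryNode();
SDVTList VTs = DAG.getVTList(MVT::f64, MVT::Other);
SDValue Sum = DAG.getNode(ISD::STRICT_FADD, DL, VTs, {Chain, LHS, RHS});
// The new chain must be threaded into later FP operations so that
// exception-sensitive work keeps its ordering.
SDValue OutChain = Sum.getValue(1);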
@@ -820,12 +847,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -895,6 +925,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); @@ -933,37 +965,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); - - // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into - // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is - // split again based on the input type, this will cause an AssertSExt i16 to - // be emitted instead of an AssertZExt. This will allow packssdw followed by - // packuswb to be used to truncate to v8i8. This is necessary since packusdw - // isn't available until sse4.1. - setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); + } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); + + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for @@ -1008,6 +1041,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // With AVX512, expanding (and promoting the shifts) is better. if (!Subtarget.hasAVX512()) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + + setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -1029,11 +1068,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { - setOperationAction(ISD::FFLOOR, RoundedTy, Legal); - setOperationAction(ISD::FCEIL, RoundedTy, Legal); - setOperationAction(ISD::FTRUNC, RoundedTy, Legal); - setOperationAction(ISD::FRINT, RoundedTy, Legal); - setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); + setOperationAction(ISD::FCEIL, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); + setOperationAction(ISD::FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); + setOperationAction(ISD::FRINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); + setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); @@ -1072,6 +1116,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + + if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { + // We need to scalarize v4i64->v432 uint_to_fp using cvtsi2ss, but we can + // do the pre and post work in the vector domain. + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); + // We need to mark SINT_TO_FP as Custom even though we want to expand it + // so that DAG combine doesn't try to turn it into uint_to_fp. 
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { @@ -1105,25 +1160,45 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); - setOperationAction(ISD::FNEG, VT, Custom); - setOperationAction(ISD::FABS, VT, Custom); - setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); @@ -1169,6 +1244,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); @@ -1180,8 +1257,10 @@ X86TargetLowering::X86TargetLowering(const 
X86TargetMachine &TM, if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, - MVT::v2f64, MVT::v4f64 }) + MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); + } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1233,6 +1312,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { @@ -1299,12 +1379,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { @@ -1331,6 +1417,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); @@ -1372,21 +1460,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } - setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); - - setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom); + for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) { + setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); + } + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal); + + setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); @@ -1420,11 +1524,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::FFLOOR, VT, Legal); - setOperationAction(ISD::FCEIL, VT, Legal); - setOperationAction(ISD::FTRUNC, VT, Legal); - 
setOperationAction(ISD::FRINT, VT, Legal); - setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::FFLOOR, VT, Legal); + setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); + setOperationAction(ISD::FCEIL, VT, Legal); + setOperationAction(ISD::STRICT_FCEIL, VT, Legal); + setOperationAction(ISD::FTRUNC, VT, Legal); + setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); + setOperationAction(ISD::FRINT, VT, Legal); + setOperationAction(ISD::STRICT_FRINT, VT, Legal); + setOperationAction(ISD::FNEARBYINT, VT, Legal); + setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); } @@ -1459,6 +1568,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCC, VT, Custom); + setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use @@ -1470,8 +1581,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1532,13 +1647,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. - // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, + Subtarget.hasVLX() ? 
Legal : Custom); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); @@ -1563,12 +1690,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, Legal); - setOperationAction(ISD::UINT_TO_FP, VT, Legal); - setOperationAction(ISD::FP_TO_SINT, VT, Legal); - setOperationAction(ISD::FP_TO_UINT, VT, Legal); - - setOperationAction(ISD::MUL, VT, Legal); + setOperationAction(ISD::SINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::UINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_SINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::FP_TO_UINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, + Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::MUL, VT, Legal); } } @@ -1739,12 +1877,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. - setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { @@ -1828,8 +1968,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : - {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, - ISD::FLOG10, ISD::FPOW, ISD::FSIN}) + {ISD::FCEIL, ISD::STRICT_FCEIL, + ISD::FCOS, ISD::STRICT_FCOS, + ISD::FEXP, ISD::STRICT_FEXP, + ISD::FFLOOR, ISD::STRICT_FFLOOR, + ISD::FREM, ISD::STRICT_FREM, + ISD::FLOG, ISD::STRICT_FLOG, + ISD::FLOG10, ISD::STRICT_FLOG10, + ISD::FPOW, ISD::STRICT_FPOW, + ISD::FSIN, ISD::STRICT_FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); @@ -1870,6 +2017,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::STRICT_SINT_TO_FP); + setTargetDAGCombine(ISD::STRICT_UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1901,6 +2050,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); + + // Default to having -disable-strictnode-mutation on + IsStrictFPEnabled = true; } // This has so far only been implemented for 64-bit MachO. 
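The Subtarget.hasVLX() ? Legal : Custom entries above make the 128-bit and 256-bit conversions take a custom path when only 512-bit EVEX instructions are usable. A rough sketch of that widening idea, simplified to one type pair; the real lowering also threads strict chains and covers more types:

// Widen v8i32 to v16i32 with an undef upper half, convert at 512 bits,
// then extract the low v8f32 of the result.
static SDValue widenUIntToFPTo512(SDValue Src, EVT ResVT, const SDLoc &DL,
                                  SelectionDAG &DAG) {
  EVT WideSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, 16);
  EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, 16);
  SDValue WideSrc = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideSrcVT,
                                DAG.getUNDEF(WideSrcVT), Src,
                                DAG.getIntPtrConstant(0, DL));
  SDValue WideRes = DAG.getNode(ISD::UINT_TO_FP, DL, WideResVT, WideSrc);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, WideRes,
                     DAG.getIntPtrConstant(0, DL));
}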
@@ -1910,7 +2062,7 @@ bool X86TargetLowering::useLoadStackGuardNode() const { bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. - return Subtarget.getTargetTriple().isOSMSVCRT(); + return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, @@ -1946,9 +2098,13 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return MVT::i8; + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) + return MVT::v32i1; // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && - Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && + Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -1966,9 +2122,13 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) return VT.getVectorNumElements(); + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) + return 2; // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && - Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && + Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -1988,6 +2148,15 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } + // Split v64i1 vectors if we don't have v64i8 available. + if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + CC != CallingConv::X86_RegCall) { + RegisterVT = MVT::v32i1; + IntermediateVT = MVT::v32i1; + NumIntermediates = 2; + return 2; + } + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } @@ -2383,6 +2552,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); + const TargetMachine &TM = getTargetMachine(); + if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) + return false; + return SrcAS < 256 && DestAS < 256; } @@ -2520,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); - // If this is x86-64, and we disabled SSE, we can't return FP values, - // or SSE or MMX vectors. - if ((ValVT == MVT::f32 || ValVT == MVT::f64 || - VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + // Report an error if we have attempted to return a value via an XMM + // register and SSE was disabled. 
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. + } else if (!Subtarget.hasSSE2() && + X86::FR64XRegClass.contains(VA.getLocReg()) && + ValVT == MVT::f64) { + // When returning a double via an XMM register, report an error if SSE2 is + // not enabled. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } @@ -2826,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult( const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; - bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); @@ -2845,15 +3015,22 @@ SDValue X86TargetLowering::LowerCallResult( RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } - // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && - ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { + // Report an error if there was an attempt to return FP values via XMM + // registers. + if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. - } else if (CopyVT == MVT::f64 && - (Is64Bit && !Subtarget.hasSSE2())) { + if (VA.getLocReg() == X86::XMM1) + VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. + else + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (!Subtarget.hasSSE2() && + X86::FR64XRegClass.contains(VA.getLocReg()) && + CopyVT == MVT::f64) { errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); - VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + if (VA.getLocReg() == X86::XMM1) + VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. + else + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2895,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult( Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } + if (VA.getLocInfo() == CCValAssign::BCvt) + Val = DAG.getBitcast(VA.getValVT(), Val); + InVals.push_back(Val); } @@ -2993,9 +3173,7 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { - auto Attr = - CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); - if (!CI->isTailCall() || Attr.getValueAsString() == "true") + if (!CI->isTailCall()) return false; ImmutableCallSite CS(CI); @@ -3464,8 +3642,8 @@ SDValue X86TargetLowering::LowerFormalArguments( FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); - // Conservatively forward AL on x86_64, since it might be used for varargs. 
- if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + // Forward AL for SysV x86_64 targets, since it is used for varargs. + if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } @@ -3618,7 +3796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); - auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || @@ -3634,9 +3811,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); - if (Attr.getValueAsString() == "true") - isTailCall = false; - if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT @@ -3728,7 +3902,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, "the only memory argument"); } - if (!IsSibcall) + if (!IsSibcall && !IsMustTail) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); @@ -4013,7 +4187,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector<SDValue, 8> Ops; - if (!IsSibcall && isTailCall) { + if (!IsSibcall && isTailCall && !IsMustTail) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); @@ -4183,23 +4357,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned -X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, - SelectionDAG& DAG) const { - const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlignment = TFI.getStackAlignment(); - uint64_t AlignMask = StackAlignment - 1; - int64_t Offset = StackSize; - unsigned SlotSize = RegInfo->getSlotSize(); - if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { - // Number smaller than 12 so just add the difference. - Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); - } else { - // Mask out lower bits, add stackalignment once plus the 12 bytes. - Offset = ((~AlignMask) & Offset) + StackAlignment + - (StackAlignment-SlotSize); - } - return Offset; +X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, + SelectionDAG &DAG) const { + const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); + assert(StackSize % SlotSize == 0 && + "StackSize must be a multiple of SlotSize"); + return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; } /// Return true if the given stack call argument is already available in the @@ -4643,8 +4807,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv, } } -/// Return true if the condition is an unsigned comparison operation. 
-static bool isX86CCUnsigned(unsigned X86CC) { +/// Return true if the condition is an signed comparison operation. +static bool isX86CCSigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); @@ -4654,12 +4818,12 @@ static bool isX86CCUnsigned(unsigned X86CC) { case X86::COND_A: case X86::COND_BE: case X86::COND_AE: - return true; + return false; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: - return false; + return true; } } @@ -4700,7 +4864,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // X >= 0 -> X == 0, jump on !sign. return X86::COND_NS; } - if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) { + if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; @@ -4949,12 +5113,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } -bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, - bool IsSigned) const { - // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available. - return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov(); -} - bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) @@ -5334,15 +5492,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable, + bool V2IsZero, SmallVectorImpl<int> &WidenedMask) { - SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end()); - for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - if (TargetMask[i] == SM_SentinelUndef) - continue; - if (Zeroable[i]) - TargetMask[i] = SM_SentinelZero; + // Create an alternative mask with info about zeroable elements. + // Here we do not set undef elements as zeroable. + SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end()); + if (V2IsZero) { + assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); + for (int i = 0, Size = Mask.size(); i != Size; ++i) + if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + ZeroableMask[i] = SM_SentinelZero; } - return canWidenShuffleElements(TargetMask, WidenedMask); + return canWidenShuffleElements(ZeroableMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef<int> Mask) { @@ -5764,11 +5925,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + + // Do an optimization for the the most frequently used types. 
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) { + APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems); + Mask0.flipAllBits(); + SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems)); + SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0); + Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + + // Reduce to original width if needed. + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + } + + // Clear the upper bits of the subvector and move it to its insert position. SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); - unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); @@ -5850,7 +6029,7 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, - std::max(128U, VT.getSizeInBits() / Scale)); + std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } @@ -6719,9 +6898,97 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } +/// Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static void computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2, + APInt &KnownUndef, APInt &KnownZero) { + int Size = Mask.size(); + KnownUndef = KnownZero = APInt::getNullValue(Size); + + V1 = peekThroughBitcasts(V1); + V2 = peekThroughBitcasts(V2); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + int VectorSizeInBits = V1.getValueSizeInBits(); + int ScalarSizeInBits = VectorSizeInBits / Size; + assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); + + for (int i = 0; i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0) { + KnownUndef.setBit(i); + continue; + } + if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + KnownZero.setBit(i); + continue; + } + + // Determine shuffle input and normalize the mask. + SDValue V = M < Size ? V1 : V2; + M %= Size; + + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + // If the BUILD_VECTOR has fewer elements then the bitcasted portion of + // the (larger) source element must be UNDEF/ZERO. 
+ if ((Size % V.getNumOperands()) == 0) { + int Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef()) + KnownUndef.setBit(i); + if (X86::isZeroNode(Op)) + KnownZero.setBit(i); + else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { + APInt Val = Cst->getAPIntValue(); + Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); + if (Val == 0) + KnownZero.setBit(i); + } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { + APInt Val = Cst->getValueAPF().bitcastToAPInt(); + Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); + if (Val == 0) + KnownZero.setBit(i); + } + continue; + } + + // If the BUILD_VECTOR has more elements then all the (smaller) source + // elements must be UNDEF or ZERO. + if ((V.getNumOperands() % Size) == 0) { + int Scale = V->getNumOperands() / Size; + bool AllUndef = true; + bool AllZero = true; + for (int j = 0; j < Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllUndef &= Op.isUndef(); + AllZero &= X86::isZeroNode(Op); + } + if (AllUndef) + KnownUndef.setBit(i); + if (AllZero) + KnownZero.setBit(i); + continue; + } + } +} + /// Decode a target shuffle mask and inputs and see if any values are /// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. +/// FIXME: Merge this with computeZeroableShuffleElements? static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, APInt &KnownUndef, APInt &KnownZero) { @@ -6741,7 +7008,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); - assert((VT.getSizeInBits() % Mask.size()) == 0 && + assert((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Size; @@ -6810,7 +7077,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, // Replace target shuffle mask elements with known undef/zero sentinels. static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, const APInt &KnownUndef, - const APInt &KnownZero) { + const APInt &KnownZero, + bool ResolveKnownZeros= true) { unsigned NumElts = Mask.size(); assert(KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); @@ -6818,7 +7086,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, for (unsigned i = 0; i != NumElts; ++i) { if (KnownUndef[i]) Mask[i] = SM_SentinelUndef; - else if (KnownZero[i]) + else if (ResolveKnownZeros && KnownZero[i]) Mask[i] = SM_SentinelZero; } } @@ -8306,7 +8574,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. 
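The KSHIFTL/KSHIFTR fast path added to insert1BitVector above is easier to follow on a plain scalar mask: the AND clears the destination field, and the shift pair both positions the subvector and zeroes every stray bit around it. The sketch below is illustrative only, not part of this patch; it models a v64i1 mask register as a uint64_t, and the helper name and parameters are hypothetical.

#include <cstdint>

// Insert the low SubLen bits of Sub into Vec starting at bit Idx, keeping the
// rest of Vec intact. Preconditions: 1 <= SubLen and Idx + SubLen <= 64.
static uint64_t insertMaskSubvector(uint64_t Vec, uint64_t Sub,
                                    unsigned Idx, unsigned SubLen) {
  const unsigned NumElems = 64;           // width of the modeled k-register
  // AND with the flipped bit-range mask (APInt::getBitsSet + flipAllBits).
  uint64_t Field = (SubLen == 64) ? ~0ULL : (((1ULL << SubLen) - 1) << Idx);
  Vec &= ~Field;
  // KSHIFTL then KSHIFTR: the left shift drops any bits of Sub above SubLen
  // off the top, and the right shift parks the field at position Idx.
  unsigned ShiftLeft  = NumElems - SubLen;
  unsigned ShiftRight = NumElems - SubLen - Idx;
  Sub = (Sub << ShiftLeft) >> ShiftRight;
  return Vec | Sub;                       // OR the subvector into place
}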
@@ -8552,7 +8820,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, ImmH = DAG.getBitcast(MVT::v32i1, ImmH); DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); } else { - MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U)); + MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); @@ -10130,13 +10398,18 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) { return true; } -/// Test whether there are elements crossing 128-bit lanes in this +/// Test whether there are elements crossing LaneSizeInBits lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. -static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); +static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, + unsigned ScalarSizeInBits, + ArrayRef<int> Mask) { + assert(LaneSizeInBits && ScalarSizeInBits && + (LaneSizeInBits % ScalarSizeInBits) == 0 && + "Illegal shuffle lane size"); + int LaneSize = LaneSizeInBits / ScalarSizeInBits; int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) @@ -10144,6 +10417,12 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { return false; } +/// Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); +} + /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same @@ -10424,84 +10703,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } -/// Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - APInt Zeroable(Mask.size(), 0); - V1 = peekThroughBitcasts(V1); - V2 = peekThroughBitcasts(V2); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - int VectorSizeInBits = V1.getValueSizeInBits(); - int ScalarSizeInBits = VectorSizeInBits / Mask.size(); - assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable.setBit(i); - continue; - } - - // Determine shuffle input and normalize the mask. - SDValue V = M < Size ? V1 : V2; - M %= Size; - - // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. 
- if (V.getOpcode() != ISD::BUILD_VECTOR) - continue; - - // If the BUILD_VECTOR has fewer elements then the bitcasted portion of - // the (larger) source element must be UNDEF/ZERO. - if ((Size % V.getNumOperands()) == 0) { - int Scale = Size / V->getNumOperands(); - SDValue Op = V.getOperand(M / Scale); - if (Op.isUndef() || X86::isZeroNode(Op)) - Zeroable.setBit(i); - else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { - APInt Val = Cst->getAPIntValue(); - Val.lshrInPlace((M % Scale) * ScalarSizeInBits); - Val = Val.getLoBits(ScalarSizeInBits); - if (Val == 0) - Zeroable.setBit(i); - } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { - APInt Val = Cst->getValueAPF().bitcastToAPInt(); - Val.lshrInPlace((M % Scale) * ScalarSizeInBits); - Val = Val.getLoBits(ScalarSizeInBits); - if (Val == 0) - Zeroable.setBit(i); - } - continue; - } - - // If the BUILD_VECTOR has more elements then all the (smaller) source - // elements must be UNDEF or ZERO. - if ((V.getNumOperands() % Size) == 0) { - int Scale = V->getNumOperands() / Size; - bool AllZeroable = true; - for (int j = 0; j < Scale; ++j) { - SDValue Op = V.getOperand((M * Scale) + j); - AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); - } - if (AllZeroable) - Zeroable.setBit(i); - continue; - } - } - - return Zeroable; -} - // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. @@ -10616,11 +10817,11 @@ static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } -static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, - unsigned &UnpackOpcode, bool IsUnary, - ArrayRef<int> TargetMask, - const SDLoc &DL, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, + unsigned &UnpackOpcode, bool IsUnary, + ArrayRef<int> TargetMask, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; @@ -10728,8 +10929,8 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, return SDValue(); } -static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, - int Delta) { +static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, + int Delta) { int Size = (int)Mask.size(); int Split = Size / Delta; int TruncatedVectorStart = SwappedOps ? Size : 0; @@ -10814,8 +11015,8 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, // The first half/quarter of the mask should refer to every second/fourth // element of the vector truncated and bitcasted. - if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) && - !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4)) + if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) && + !matchShuffleAsVPMOV(Mask, SwappedOps, 4)) return SDValue(); return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); @@ -10823,11 +11024,10 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. 
-static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, - SDValue &V2, unsigned &PackOpcode, - ArrayRef<int> TargetMask, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, + unsigned &PackOpcode, ArrayRef<int> TargetMask, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); @@ -10880,8 +11080,8 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; - if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, - Subtarget)) + if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, + Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); @@ -10972,10 +11172,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG); -static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef<int> Mask, - const APInt &Zeroable, bool &ForceV1Zero, - bool &ForceV2Zero, uint64_t &BlendMask) { +static bool matchShuffleAsBlend(SDValue V1, SDValue V2, + MutableArrayRef<int> Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -11038,8 +11238,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 64> Mask(Original.begin(), Original.end()); - if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, - BlendMask)) + if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, + BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. @@ -11161,7 +11361,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v32i16: case MVT::v64i8: { // Attempt to lower to a bitmask if we can. Only if not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize) { if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -11609,9 +11809,11 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, } /// Try to lower a vector shuffle as a byte shift sequence. -static SDValue lowerVectorShuffleAsByteShiftMask( - const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { +static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); @@ -14056,8 +14258,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return BitBlend; // Try to use byte shift instructions to mask. 
- if (SDValue V = lowerVectorShuffleAsByteShiftMask( - DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // Try to lower by permuting the inputs into an unpack instruction. @@ -14318,8 +14520,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Try to use byte shift instructions to mask. - if (SDValue V = lowerVectorShuffleAsByteShiftMask( - DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) + if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -14686,6 +14888,36 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, DAG); } +// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). +// TODO: Extend to support v8f32 (+ 512-bit shuffles). +static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT == MVT::v4f64 && "Only for v4f64 shuffles"); + + int LHSMask[4] = {-1, -1, -1, -1}; + int RHSMask[4] = {-1, -1, -1, -1}; + unsigned SHUFPMask = 0; + + // As SHUFPD uses a single LHS/RHS element per lane, we can always + // perform the shuffle once the lanes have been shuffled in place. + for (int i = 0; i != 4; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + int LaneBase = i & ~1; + auto &LaneMask = (i & 1) ? RHSMask : LHSMask; + LaneMask[LaneBase + (M & 1)] = M; + SHUFPMask |= (M & 1) << i; + } + + SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask); + SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask); + return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS, + DAG.getTargetConstant(SHUFPMask, DL, MVT::i8)); +} + /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// @@ -14764,13 +14996,22 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( int Size = Mask.size(); int LaneSize = Size / 2; + // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). + // Only do this if the elements aren't all from the lower lane, + // otherwise we're (probably) better off doing a split. + if (VT == MVT::v4f64 && + !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) + if (SDValue V = + lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG)) + return V; + // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. 
if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize)) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); @@ -14778,7 +15019,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle( bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) - LaneUsed[(Mask[i] / LaneSize)] = true; + LaneUsed[(Mask[i] % Size) / LaneSize] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } @@ -14817,8 +15058,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); + bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 4> WidenedMask; - if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask)) + if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; @@ -15637,6 +15880,18 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return Op; + // If we have lane crossing shuffles AND they don't all come from the lower + // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). + // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently + // canonicalize to a blend of splat which isn't necessary for this combine. + if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) && + !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && + (V1.getOpcode() != ISD::BUILD_VECTOR) && + (V2.getOpcode() != ISD::BUILD_VECTOR)) + if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, + Mask, DAG)) + return Op; + // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) @@ -16950,6 +17205,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; break; case MVT::v64i1: + // Fall back to scalarization. FIXME: We can do better if the shuffle + // can be partitioned cleanly. + if (!Subtarget.useBWIRegs()) + return SDValue(); ExtVT = MVT::v64i8; break; } @@ -17039,8 +17298,8 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. -static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> OrigMask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); @@ -17086,29 +17345,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. 
- APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2); + APInt KnownUndef, KnownZero; + computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero); + + APInt Zeroable = KnownUndef | KnownZero; if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); - // Create an alternative mask with info about zeroable elements. - // Here we do not set undef elements as zeroable. - SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end()); - if (V2IsZero) { - assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); - for (int i = 0; i != NumElements; ++i) - if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) - ZeroableMask[i] = SM_SentinelZero; - } - // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && - canWidenShuffleElements(ZeroableMask, WidenedMask)) { + canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this @@ -18307,7 +18559,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); @@ -18328,8 +18580,13 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); - SDValue Src = Op.getOperand(0); + Op.getOpcode() == ISD::STRICT_SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_UINT_TO_FP || + Op.getOpcode() == ISD::UINT_TO_FP) && + "Unexpected opcode!"); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 
1 : 0; + SDValue Src = Op.getOperand(OpNo); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); @@ -18346,7 +18603,17 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); + if (IsStrict) { + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other}, + {Op.getOperand(0), InVec}); + SDValue Chain = CvtVec.getValue(1); + SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, + DAG.getIntPtrConstant(0, dl)); + return DAG.getMergeValues({Value, Chain}, dl); + } + SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } @@ -18415,44 +18682,157 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } +static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDLoc DL(Op); + bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op->getSimpleValueType(0); + SDValue Src = Op->getOperand(IsStrict ? 1 : 0); + + if (Subtarget.hasDQI()) { + assert(!Subtarget.hasVLX() && "Unexpected features"); + + assert((Src.getSimpleValueType() == MVT::v2i64 || + Src.getSimpleValueType() == MVT::v4i64) && + "Unsupported custom type"); + + // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) && + "Unexpected VT!"); + MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64) + : DAG.getUNDEF(MVT::v8i64); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } + + bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP || + Op->getOpcode() == ISD::STRICT_SINT_TO_FP; + if (VT != MVT::v4f32 || IsSigned) + return SDValue(); + + SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64); + SDValue One = DAG.getConstant(1, DL, MVT::v4i64); + SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, + DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One), + DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One)); + SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4); + SmallVector<SDValue, 4> Chains(4); + for (int i = 0; i != 4; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, + DAG.getIntPtrConstant(i, DL)); + if (IsStrict) { + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Src}); + Chains[i] = SignCvts[i].getValue(1); + } else { + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); + } + } + SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts); + + SDValue Slow, Chain; + if (IsStrict) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other}, + 
{Chain, SignCvt, SignCvt}); + Chain = Slow.getValue(1); + } else { + Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt); + } + + IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg); + SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt); + + if (IsStrict) + return DAG.getMergeValues({Cvt, Chain}, DL); + + return Cvt; +} + SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); + SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - if (VT == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); - if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { + // Note: Since v2f64 is a legal type. We don't need to zero extend the + // source for strict FP. + if (IsStrict) + return DAG.getNode( + X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, + {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getUNDEF(SrcVT))}); return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } + if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64) + return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); + return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); + bool UseSSEReg = isScalarFPTypeInSSEReg(VT); + // These are really Legal; return the operand so the caller accepts it as // Legal. - if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT)) + if (SrcVT == MVT::i32 && UseSSEReg) return Op; - if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) + if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit()) return Op; if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; - SDValue ValueToStore = Op.getOperand(0); - if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && - !Subtarget.is64Bit()) + // SSE doesn't have an i16 conversion so we need to promote. + if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) { + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {Chain, Ext}); + + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext); + } + + if (VT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); + + SDValue ValueToStore = Src; + if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. 
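The unsigned v4i64 to v4f32 path added in lowerINT_TO_FP_vXi64 above relies on the classic halve-with-sticky-bit trick when the top bit is set. A minimal scalar sketch, illustrative only and not part of this patch (the helper name is hypothetical):

#include <cstdint>

static float u64ToF32(uint64_t X) {
  // Values that fit in a signed i64 can use the signed convert directly.
  if ((int64_t)X >= 0)
    return (float)(int64_t)X;
  // Otherwise halve the value, ORing the shifted-out bit back in so the
  // sticky information survives (no double rounding); this is the
  // (Src >> 1) | (Src & 1) computed above.
  uint64_t Halved = (X >> 1) | (X & 1);
  float Converted = (float)(int64_t)Halved; // per-element SINT_TO_FP
  return Converted + Converted;             // the final FADD doubles it back
}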
@@ -18463,13 +18843,18 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, ValueToStore, StackSlot, + Chain = DAG.getStore( + Chain, dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); - return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + + return Tmp.first; } -SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, +std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD @@ -18498,9 +18883,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, FILDOps, SrcVT, LoadMMO); + Chain = Result.getValue(1); if (useSSE) { - Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is glued to the FILD_FLAG. This @@ -18522,9 +18907,10 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); + Chain = Result.getValue(1); } - return Result; + return { Result, Chain }; } /// Horizontal vector math instructions may be slower than normal math with @@ -18532,7 +18918,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool IsOptimizingSize = DAG.shouldOptForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } @@ -18553,6 +18939,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, #endif */ + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); @@ -18573,8 +18961,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. - SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Op.getOperand(0)); + SDValue XR1 = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -18587,51 +18975,81 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + SDValue Sub; + SDValue Chain; // TODO: Are there any fast-math-flags to propagate here? 
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + if (IsStrict) { + Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), XR2F, CLod1}); + Chain = Sub.getValue(1); + } else + Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { + if (!IsStrict && Subtarget.hasSSE3() && + shouldUseHorizontalOp(true, DAG, Subtarget)) { + // FIXME: Do we need a STRICT version of FHADD? Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); - Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); + if (IsStrict) { + Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other}, + {Chain, Shuffle, Sub}); + Chain = Result.getValue(1); + } else + Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Result, Chain}, dl); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, - DAG.getIntPtrConstant(0, dl)); + return Result; } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. - SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, - Op.getOperand(0)); + SDValue Load = + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); - Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getBitcast(MVT::v2f64, Load), - DAG.getIntPtrConstant(0, dl)); - // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, - DAG.getBitcast(MVT::v2i64, - DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), + DAG.getBitcast(MVT::v2i64, Load), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); + if (Op.getNode()->isStrictFPOpcode()) { + // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? + SDValue Chain = Op.getOperand(0); + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, + {Chain, Or, Bias}); + + if (Op.getValueType() == Sub.getValueType()) + return Sub; + + // Handle final rounding. + std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound( + Sub, Sub.getValue(1), dl, Op.getSimpleValueType()); + + return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl); + } + // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); @@ -18646,38 +19064,123 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); - SDValue N0 = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + + SDValue N0 = Op.getOperand(IsStrict ? 
1 : 0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); - // Legalize to v4i32 type. - N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, - DAG.getUNDEF(MVT::v2i32)); + if (Subtarget.hasAVX512()) { + if (!Subtarget.hasVLX()) { + // Let generic type legalization widen this. + if (!IsStrict) + return SDValue(); + // Otherwise pad the integer input with 0s and widen the operation. + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getConstant(0, DL, MVT::v2i32)); + SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other}, + {Op.getOperand(0), N0}); + SDValue Chain = Res.getValue(1); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res, + DAG.getIntPtrConstant(0, DL)); + return DAG.getMergeValues({Res, Chain}, DL); + } - if (Subtarget.hasAVX512()) + // Legalize to v4i32 type. + N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), N0}); return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); + } - // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, - // but using v2i32 to v2f64 with X86ISD::CVTSI2P. - SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); - SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); - - // Two to the power of half-word-size. - SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64); - - // Clear upper part of LO, lower HI. - SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); - SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); - - SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); - fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); - SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); + // Zero extend to 2i64, OR with the floating point representation of 2^52. + // This gives us the floating point equivalent of 2^52 + the i32 integer + // since double has 52-bits of mantissa. Then subtract 2^52 in floating + // point leaving just our i32 integers in double format. + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0); + SDValue VBias = + DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn, + DAG.getBitcast(MVT::v2i64, VBias)); + Or = DAG.getBitcast(MVT::v2f64, Or); - // Add the two halves. - return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); + if (IsStrict) + return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other}, + {Op.getOperand(0), Or, VBias}); + return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + SDLoc DL(Op); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue V = Op->getOperand(IsStrict ? 1 : 0); + MVT VecIntVT = V.getSimpleValueType(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + + if (Subtarget.hasAVX512()) { + // With AVX512, but not VLX we need to widen to get a 512-bit result type. + assert(!Subtarget.hasVLX() && "Unexpected features"); + MVT VT = Op->getSimpleValueType(0); + + // v8i32->v8f64 is legal with AVX512 so just return it. + if (VT == MVT::v8f64) + return Op; + + assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) && + "Unexpected VT!"); + MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; + MVT WideIntVT = VT == MVT::v4f64 ? 
MVT::v8i32 : MVT::v16i32; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + SDValue Tmp = + IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT); + V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other}, + {Op->getOperand(0), V}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } + + if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 && + Op->getSimpleValueType(0) == MVT::v4f64) { + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V); + Constant *Bias = ConstantFP::get( + *DAG.getContext(), + APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); + auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8); + SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; + SDValue VBias = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*Alignment*/ 8, MachineMemOperand::MOLoad); + + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, + DAG.getBitcast(MVT::v4i64, VBias)); + Or = DAG.getBitcast(MVT::v4f64, Or); + + if (IsStrict) + return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other}, + {Op.getOperand(0), Or, VBias}); + return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias); + } + // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); @@ -18690,18 +19193,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; - // We shouldn't use it when unsafe-fp-math is enabled though: we might later - // reassociate the two FADDs, and if we do that, the algorithm fails - // spectacularly (PR24512). - // FIXME: If we ever have some kind of Machine FMF, this should be marked - // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because - // there's also the MachineCombiner reassociations happening on Machine IR. - if (DAG.getTarget().Options.UnsafeFPMath) - return SDValue(); - - SDLoc DL(Op); - SDValue V = Op->getOperand(0); - MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, @@ -18709,9 +19200,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); - assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && - "Unsupported custom type"); - // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 @@ -18756,23 +19244,35 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } - // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). - SDValue VecCstFAdd = DAG.getConstantFP( - APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); + // Create the vector constant for (0x1.0p39f + 0x1.0p23f). 
+ SDValue VecCstFSub = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // NOTE: By using fsub of a positive constant instead of fadd of a negative + // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is + // enabled. See PR24512. SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? - SDValue FHigh = - DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); - // return (float4) lo + fhi; + // (float4) lo; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); + // return (float4) lo + fhi; + if (IsStrict) { + SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other}, + {Op.getOperand(0), HighBitcast, VecCstFSub}); + return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other}, + {FHigh.getValue(1), LowBitcast, FHigh}); + } + + SDValue FHigh = + DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue N0 = Op.getOperand(0); + unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; + SDValue N0 = Op.getOperand(OpNo); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); @@ -18783,18 +19283,23 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: - assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); + case MVT::v2i64: + case MVT::v4i64: + return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue N0 = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + unsigned OpNo = IsStrict ? 1 : 0; + SDValue Src = Op.getOperand(OpNo); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - MVT SrcVT = N0.getSimpleValueType(); - MVT DstVT = Op.getSimpleValueType(); + MVT SrcVT = Src.getSimpleValueType(); + MVT DstVT = Op->getSimpleValueType(0); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); @@ -18814,8 +19319,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Promote i32 to i64 and use a signed conversion on 64-bit targets. 
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { - N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); - return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); + Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, + {Chain, Src}); + return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); } if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) @@ -18823,7 +19331,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); - if (SrcVT == MVT::i32 && X86ScalarSSEf64) + if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); @@ -18832,23 +19340,28 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); - SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, MachinePointerInfo()); + SDValue Store1 = + DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); - SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); - return Fild; + std::pair<SDValue, SDValue> Tmp = + BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + if (IsStrict) + return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); + + return Tmp.first; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue ValueToStore = Op.getOperand(0); - if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) + SDValue ValueToStore = Src; + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) { // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, - MachinePointerInfo()); + } + SDValue Store = + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, @@ -18863,32 +19376,42 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Ops[] = { Store, StackSlot }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); + Chain = Fild.getValue(1); - APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), - Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); + Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); - // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. + // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits. + APInt FF(64, 0x5F80000000000000ULL); SDValue FudgePtr = DAG.getConstantPool( - ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); + ConstantInt::get(*DAG.getContext(), FF), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 
SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); - SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); + SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. - // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); + Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? + if (IsStrict) { + SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, + {Chain, Fild, Fudge}); + // STRICT_FP_ROUND can't handle equal types. + if (DstVT == MVT::f80) + return Add; + return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, + {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)}); + } SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); @@ -18902,11 +19425,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // result. SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned) const { + bool IsSigned, SDValue &Chain) const { + bool IsStrict = Op->isStrictFPOpcode(); SDLoc DL(Op); EVT DstTy = Op.getValueType(); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Value = Op.getOperand(IsStrict ? 1 : 0); + EVT TheVT = Value.getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { @@ -18920,6 +19445,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; + // FIXME: This does not generate an invalid exception if the input does not + // fit in i32. PR44019 if (!IsSigned && DstTy != MVT::i64) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. @@ -18938,8 +19465,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getEntryNode(); - SDValue Value = Op.getOperand(0); + Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { @@ -18949,8 +19476,9 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // - // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; - // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Adjust = (Value < Thresh) ? 0 : 0x80000000; + // FltOfs = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value - FltOfs); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. 
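The Adjust/FltOfs pseudocode in the comment above corresponds to the following scalar model of an f64 to u64 conversion built from a signed FIST. It is illustrative only, not part of this patch, and the helper name is hypothetical; inputs outside [0, 2^64) are undefined here just as they are for the lowered code.

#include <cstdint>

static uint64_t f64ToU64(double Value) {
  const double Thresh = 9223372036854775808.0;       // 2^63, exact in f64
  uint64_t Adjust = (Value < Thresh) ? 0 : 0x8000000000000000ULL;
  double FltOfs   = (Value < Thresh) ? 0.0 : Thresh; // bias large values down
  int64_t Fist    = (int64_t)(Value - FltOfs);       // the fist-to-mem64 step
  return (uint64_t)Fist ^ Adjust;                    // restore the top bit
}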
@@ -18975,19 +19503,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); - SDValue Cmp = DAG.getSetCC(DL, - getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT), - Value, ThreshVal, ISD::SETLT); + EVT ResVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT); + SDValue Cmp; + if (IsStrict) { + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT, + Chain, /*IsSignaling*/ true); + Chain = Cmp.getValue(1); + } else { + Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT); + } + Adjust = DAG.getSelect(DL, MVT::i64, Cmp, DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(APInt::getSignMask(64), DL, MVT::i64)); - SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); - Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), TheVT), - Value, ThreshVal, ISD::SETLT); - Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, + DAG.getConstantFP(0.0, DL, TheVT), + ThreshVal); + + if (IsStrict) { + Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, + { Chain, Value, FltOfs }); + Chain = Value.getValue(1); + } else + Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs); } MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); @@ -19017,6 +19557,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, Ops, DstTy, MMO); SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); + Chain = Res.getValue(1); // If we need an unsigned fixup, XOR the result with adjust. if (UnsignedFixup) @@ -19036,7 +19577,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) && "Unexpected extension opcode"); - assert(VT.getVectorNumElements() == VT.getVectorNumElements() && + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || @@ -19512,48 +20053,137 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { - bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - MVT VT = Op.getSimpleValueType(); - SDValue Src = Op.getOperand(0); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT; + MVT VT = Op->getSimpleValueType(0); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); - if (SrcVT == MVT::f128) { - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(SrcVT, VT); - else - LC = RTLIB::getFPTOUINT(SrcVT, VT); - - MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; - } - if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + unsigned Opc; + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (!IsSigned && !Subtarget.hasVLX()) { + assert(Subtarget.useAVX512Regs() && "Unexpected features!"); // Widen to 512-bits. 
ResVT = MVT::v8i32; TruncVT = MVT::v8i1; - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, - DAG.getUNDEF(MVT::v8f64), - Src, DAG.getIntPtrConstant(0, dl)); + Opc = Op.getOpcode(); + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64) + : DAG.getUNDEF(MVT::v8f64); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + } + SDValue Res, Chain; + if (IsStrict) { + Res = + DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Opc, dl, ResVT, Src); } - SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); + Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, - DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + + // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32. + if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) { + assert(!IsSigned && "Expected unsigned conversion!"); + assert(Subtarget.useAVX512Regs() && "Requires avx512f"); + return Op; + } + + // Widen vXi32 fp_to_uint with avx512f to 512-bit source. + if ((VT == MVT::v4i32 || VT == MVT::v8i32) && + (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) { + assert(!IsSigned && "Expected unsigned conversion!"); + assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() && + "Unexpected features!"); + MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; + MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = + IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; + } + + // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source. + if ((VT == MVT::v2i64 || VT == MVT::v4i64) && + (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) { + assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() && + !Subtarget.hasVLX() && "Unexpected features!"); + MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + // TODO: Should we just do this for non-strict as well? + SDValue Tmp = + IsStrict ? 
DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; } - assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32))); + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); + SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + if (IsStrict) { + unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI + : X86ISD::STRICT_CVTTP2UI; + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); + } + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + return DAG.getNode(Opc, dl, VT, Tmp); } return SDValue(); @@ -19575,9 +20205,21 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { assert(VT == MVT::i32 && "Unexpected VT!"); // Promote i32 to i64 and use a signed operation on 64-bit targets. + // FIXME: This does not generate an invalid exception if the input does not + // fit in i32. PR44019 if (Subtarget.is64Bit()) { - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other}, + { Op.getOperand(0), Src }); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); + + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + if (IsStrict) + return DAG.getMergeValues({ Res, Chain }, dl); + return Res; } // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can @@ -19586,28 +20228,65 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } - // Promote i16 to i32 if we can use a SSE operation. - if (VT == MVT::i16 && UseSSEReg) { + // Promote i16 to i32 if we can use a SSE operation or the type is f128. + // FIXME: This does not generate an invalid exception if the input does not + // fit in i16. PR44019 + if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) { assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other}, + { Op.getOperand(0), Src }); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + if (IsStrict) + return DAG.getMergeValues({ Res, Chain }, dl); + return Res; } - // If this is a SINT_TO_FP using SSEReg we're done. + // If this is a FP_TO_SINT using SSEReg we're done. if (UseSSEReg && IsSigned) return Op; + // fp128 needs to use a libcall. 
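A scalar sketch of the promote-and-truncate pattern the i32 and i16 result paths above use (illustration only, with a hypothetical helper name): do the conversion at the wider signed width and truncate the integer result.

#include <cstdint>

// FP_TO_SINT at i64, then TRUNCATE to 32 bits.  Note the FIXME in the hunk:
// if the value does not fit the narrow type this silently wraps instead of
// raising the invalid exception a direct narrow conversion would (PR44019).
static uint32_t fptoui32_via_i64(double V) {
  return (uint32_t)(int64_t)V;
}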
+ if (SrcVT == MVT::f128) { + RTLIB::Libcall LC; + if (IsSigned) + LC = RTLIB::getFPTOSINT(SrcVT, VT); + else + LC = RTLIB::getFPTOUINT(SrcVT, VT); + + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, + SDLoc(Op), Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; + } + // Fall back to X87. - if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned)) + SDValue Chain; + if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { + if (IsStrict) + return DAG.getMergeValues({V, Chain}, dl); return V; + } llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); if (VT == MVT::f128) { @@ -19617,14 +20296,19 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); - return DAG.getNode(X86ISD::VFPEXT, DL, VT, - DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, - In, DAG.getUNDEF(SVT))); + SDValue Res = + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT)); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, + {Op->getOperand(0), Res}); + return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); } SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); // It's legal except when f128 is involved @@ -19636,17 +20320,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; -} + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, + dl, Chain); -// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking -// the default expansion of STRICT_FP_ROUND. -static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { - // FIXME: Need to form a libcall with an input chain for f128. - assert(Op.getOperand(0).getValueType() != MVT::f128 && - "Don't know how to handle f128 yet!"); - return Op; + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -19724,12 +20408,6 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::f128) { - RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? 
RTLIB::ADD_F128 - : RTLIB::SUB_F128; - return LowerF128Call(Op, DAG, LC); - } - assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); @@ -20013,6 +20691,19 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } +// Transform to an x86-specific ALU node with flags if there is a chance of +// using an RMW op or only the flags are used. Otherwise, leave +// the node alone and emit a 'cmp' or 'test' instruction. +static bool isProfitableToUseFlagOp(SDValue Op) { + for (SDNode *U : Op->uses()) + if (U->getOpcode() != ISD::CopyToReg && + U->getOpcode() != ISD::SETCC && + U->getOpcode() != ISD::STORE) + return false; + + return true; +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -20076,15 +20767,8 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SUB: case ISD::OR: case ISD::XOR: - // Transform to an x86-specific ALU node with flags if there is a chance of - // using an RMW op or only the flags are used. Otherwise, leave - // the node alone and emit a 'test' instruction. - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && - UI->getOpcode() != ISD::SETCC && - UI->getOpcode() != ISD::STORE) - goto default_case; + if (!isProfitableToUseFlagOp(Op)) + break; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { @@ -20112,7 +20796,6 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, Op->getOperand(1)).getValue(1); } default: - default_case: break; } @@ -20131,15 +20814,26 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, - const SDLoc &dl, SelectionDAG &DAG) const { +static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, + unsigned X86CC, const SDLoc &dl, + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue Chain, bool IsSignaling) { if (isNullConstant(Op1)) - return EmitTest(Op0, X86CC, dl, DAG, Subtarget); + return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain); EVT CmpVT = Op0.getValueType(); - if (CmpVT.isFloatingPoint()) - return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); + if (CmpVT.isFloatingPoint()) { + if (Chain) { + SDValue Res = + DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, + dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); + return std::make_pair(Res, Res.getValue(1)); + } + return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1), + SDValue()); + } assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); @@ -20154,7 +20848,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { unsigned ExtendOp = - isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { // For equality comparisons try to use SIGN_EXTEND if the input was // truncate from something with enough sign bits. 
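Two of the compare canonicalizations in this EmitCmp hunk and the one that follows, sketched as plain scalar code (helper names are illustrative): widening a compare whose immediate does not fit in 8 bits must use the extension that matches the predicate's signedness, and an i64 compare whose high bits are known zero can be narrowed to i32.

#include <cstdint>

static bool sltViaWiden(int8_t A, int8_t B) {
  return (int32_t)A < (int32_t)B;              // sign-extend for a signed CC
}
static bool ultViaWiden(uint8_t A, uint8_t B) {
  return (uint32_t)A < (uint32_t)B;            // zero-extend for an unsigned CC
}
static bool eqViaNarrow(uint64_t X, uint32_t C) {
  // Precondition (what MaskedValueIsZero establishes): the top 32 bits of X
  // are zero, so the truncated compare gives the same answer.
  return (uint32_t)X == C;
}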
@@ -20178,10 +20872,22 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); } } + + // Try to shrink i64 compares if the input has enough zero bits. + // FIXME: Do this for non-constant compares for constant on LHS? + if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) && + Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. + cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 && + DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) { + CmpVT = MVT::i32; + Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); + Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return Sub.getValue(1); + return std::make_pair(Sub.getValue(1), SDValue()); } /// Convert a comparison if required by the subtarget. @@ -20189,16 +20895,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. - if (Subtarget.hasCMov() || - Cmp.getOpcode() != X86ISD::CMP || - !Cmp.getOperand(0).getValueType().isFloatingPoint() || - !Cmp.getOperand(1).getValueType().isFloatingPoint()) + bool IsCmp = Cmp.getOpcode() == X86ISD::CMP; + bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP || + Cmp.getOpcode() == X86ISD::STRICT_FCMPS; + + if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) || + !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() || + !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: - // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) + // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); @@ -20399,7 +21108,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, } else { // Use BT if the immediate can't be encoded in a TEST instruction or we // are optimizing for size and the immedaite won't fit in a byte. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && isPowerOf2_64(AndRHSVal)) { Src = AndLHS; @@ -20442,7 +21151,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. 
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, - SDValue &Op1) { + SDValue &Op1, bool &IsAlwaysSignaling) { unsigned SSECC; bool Swap = false; @@ -20481,6 +21190,22 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, if (Swap) std::swap(Op0, Op1); + switch (SetCCOpcode) { + default: + IsAlwaysSignaling = true; + break; + case ISD::SETEQ: + case ISD::SETOEQ: + case ISD::SETUEQ: + case ISD::SETNE: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETO: + case ISD::SETUO: + IsAlwaysSignaling = false; + break; + } + return SSECC; } @@ -20625,12 +21350,14 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - SDValue CC = Op.getOperand(2); - MVT VT = Op.getSimpleValueType(); + bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || + Op.getOpcode() == ISD::STRICT_FSETCCS; + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); + SDValue CC = Op.getOperand(IsStrict ? 3 : 2); + MVT VT = Op->getSimpleValueType(0); ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint(); + bool isFP = Op1.getSimpleValueType().isFloatingPoint(); SDLoc dl(Op); if (isFP) { @@ -20639,57 +21366,119 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + unsigned Opc; if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16); - Opc = X86ISD::CMPM; + Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { - Opc = X86ISD::CMPP; + Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; // The SSE/AVX packed FP comparison nodes are defined with a // floating-point vector result that matches the operand type. This allows // them to work with an SSE1 target (integer vector types are not legal). VT = Op0.getSimpleValueType(); } - // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), - // emit two comparisons and a logic op to tie them together. SDValue Cmp; - unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); - if (SSECC >= 8 && !Subtarget.hasAVX()) { - // LLVM predicate is SETUEQ or SETONE. - unsigned CC0, CC1; - unsigned CombineOpc; - if (Cond == ISD::SETUEQ) { - CC0 = 3; // UNORD - CC1 = 0; // EQ - CombineOpc = X86ISD::FOR; + bool IsAlwaysSignaling; + unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling); + if (!Subtarget.hasAVX()) { + // TODO: We could use following steps to handle a quiet compare with + // signaling encodings. + // 1. Get ordered masks from a quiet ISD::SETO + // 2. Use the masks to mask potential unordered elements in operand A, B + // 3. Get the compare results of masked A, B + // 4. Calculating final result using the mask and result from 3 + // But currently, we just fall back to scalar operations. + if (IsStrict && IsAlwaysSignaling && !IsSignaling) + return SDValue(); + + // Insert an extra signaling instruction to raise exception. 
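The IsAlwaysSignaling flag added here records that the ordering predicates (LT/LE/GT/GE and friends) always signal on NaN operands, while the equality and orderedness predicates are quiet. SSE also has no single predicate for SETUEQ or SETONE, which is why the two-compare fallback below combines an UNORD/EQ pair with FOR and an ORD/NEQ pair with FAND. A scalar illustration of those two compositions (not the committed code):

#include <cmath>

static bool cmpUEQ(double A, double B) {
  return std::isunordered(A, B) || A == B;     // UNORD  OR  EQ
}
static bool cmpONE(double A, double B) {
  return !std::isunordered(A, B) && A != B;    // ORD   AND  NEQ
}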
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) { + SDValue SignalCmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS + // FIXME: It seems we need to update the flags of all new strict nodes. + // Otherwise, mayRaiseFPException in MI will return false due to + // NoFPExcept = false by default. However, I didn't find it in other + // patches. + SignalCmp->setFlags(Op->getFlags()); + Chain = SignalCmp.getValue(1); + } + + // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), + // emit two comparisons and a logic op to tie them together. + if (SSECC >= 8) { + // LLVM predicate is SETUEQ or SETONE. + unsigned CC0, CC1; + unsigned CombineOpc; + if (Cond == ISD::SETUEQ) { + CC0 = 3; // UNORD + CC1 = 0; // EQ + CombineOpc = X86ISD::FOR; + } else { + assert(Cond == ISD::SETONE); + CC0 = 7; // ORD + CC1 = 4; // NEQ + CombineOpc = X86ISD::FAND; + } + + SDValue Cmp0, Cmp1; + if (IsStrict) { + Cmp0 = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)}); + Cmp1 = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)}); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1), + Cmp1.getValue(1)); + } else { + Cmp0 = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)); + Cmp1 = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)); + } + Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { - assert(Cond == ISD::SETONE); - CC0 = 7; // ORD - CC1 = 4; // NEQ - CombineOpc = X86ISD::FAND; + if (IsStrict) { + Cmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); + Chain = Cmp.getValue(1); + } else + Cmp = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } - - SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(CC0, dl, MVT::i8)); - SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(CC1, dl, MVT::i8)); - Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. - Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getTargetConstant(SSECC, dl, MVT::i8)); + if (IsStrict) { + // Make a flip on already signaling CCs before setting bit 4 of AVX CC. + SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4; + Cmp = DAG.getNode( + Opc, dl, {VT, MVT::Other}, + {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); + Chain = Cmp.getValue(1); + } else + Cmp = DAG.getNode( + Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. - if (Opc == X86ISD::CMPP) - Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + + if (IsStrict) + return DAG.getMergeValues({Cmp, Chain}, dl); return Cmp; } + assert(!IsStrict && "Strict SETCC only handles FP operands."); + MVT VTOp0 = Op0.getSimpleValueType(); (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && @@ -20860,6 +21649,30 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); + // Special case for sign bit test. 
We can use a v4i32 PCMPGT and shuffle + // the odd elements over the even elements. + if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) { + Op0 = DAG.getConstant(0, dl, MVT::v4i32); + Op1 = DAG.getBitcast(MVT::v4i32, Op1); + + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + static const int MaskHi[] = { 1, 1, 3, 3 }; + SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + return DAG.getBitcast(VT, Result); + } + + if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) { + Op0 = DAG.getBitcast(MVT::v4i32, Op0); + Op1 = DAG.getConstant(-1, dl, MVT::v4i32); + + SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); + static const int MaskHi[] = { 1, 1, 3, 3 }; + SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); + + return DAG.getBitcast(VT, Result); + } + // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. @@ -20999,8 +21812,9 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG, - SDValue &X86CC) const { + SelectionDAG &DAG, SDValue &X86CC, + SDValue &Chain, + bool IsSignaling) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). @@ -21043,12 +21857,32 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } + // Try to use the carry flag from the add in place of an separate CMP for: + // (seteq (add X, -1), -1). Similar for setne. + if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && + Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + if (isProfitableToUseFlagOp(Op0)) { + SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); + + SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0), + Op0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New); + X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); + return SDValue(New.getNode(), 1); + } + } + bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); - SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); + std::pair<SDValue, SDValue> Tmp = + EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling); + SDValue EFLAGS = Tmp.first; + if (Chain) + Chain = Tmp.second; EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; @@ -21056,35 +21890,48 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - MVT VT = Op.getSimpleValueType(); + bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || + Op.getOpcode() == ISD::STRICT_FSETCCS; + MVT VT = Op->getSimpleValueType(0); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Op.getOperand(IsStrict ? 
2 : 1); SDLoc dl(Op); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + ISD::CondCode CC = + cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. if (Op0.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1); + softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain, + Op.getOpcode() == ISD::STRICT_FSETCCS); // If softenSetCCOperands returned a scalar, use it. if (!Op1.getNode()) { assert(Op0.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); + if (IsStrict) + return DAG.getMergeValues({Op0, Chain}, dl); return Op0; } } SDValue X86CC; - SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); + SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain, + Op.getOpcode() == ISD::STRICT_FSETCCS); if (!EFLAGS) return SDValue(); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -21215,8 +22062,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); - unsigned SSECC = translateX86FSETCC( - cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); + bool IsAlwaysSignaling; + unsigned SSECC = + translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(), + CondOp0, CondOp1, IsAlwaysSignaling); if (Subtarget.hasAVX512()) { SDValue Cmp = @@ -21454,8 +22303,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (AddTest) { CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); - Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), - X86::COND_NE, DL, DAG); + Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); } // a < b ? -1 : 0 -> RES = ~setcc_carry @@ -21711,7 +22559,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); - assert(VT.getVectorNumElements() == VT.getVectorNumElements() && + assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || @@ -21765,12 +22613,14 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { "Expecting 256/512-bit op"); // Splitting volatile memory ops is not allowed unless the operation was not - // legal to begin with. We are assuming the input op is legal (this transform - // is only used for targets with AVX). + // legal to begin with. Assume the input store is legal (this transform is + // only used for targets with AVX). Note: It is possible that we have an + // illegal type like v2i128, and so we could allow splitting a volatile store + // in that case if that is important. 
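The sign-bit special case added to LowerVSETCC a little above exploits the fact that the sign of a 64-bit lane lives entirely in its upper dword: a 32-bit PCMPGT against zero followed by the {1,1,3,3} shuffle splats the high-dword answer across the whole lane. A per-lane scalar sketch, with an illustrative helper name:

#include <cstdint>

// "0 > X" for an i64 lane depends only on the upper 32 bits, so compare the
// high dword and splat the all-ones/all-zeros result across the lane (the
// MaskHi = {1,1,3,3} shuffle does this vector-wide).
static int64_t isNegativeLane(int64_t X) {
  int32_t Hi = (int32_t)((uint64_t)X >> 32);   // odd dword of the v2i64 lane
  int32_t Cmp = (0 > Hi) ? -1 : 0;             // v4i32 PCMPGT against zero
  return (int64_t)Cmp;                         // sign-extension == the splat
}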
if (!Store->isSimple()) return SDValue(); - MVT StoreVT = StoredVal.getSimpleValueType(); + EVT StoreVT = StoredVal.getValueType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; unsigned HalfAlign = (128 == HalfSize ? 16 : 32); @@ -22174,8 +23024,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), - X86Cond, dl, DAG); + Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), @@ -22201,7 +23050,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(2); + MaybeAlign Alignment(Op.getConstantOperandVal(2)); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack @@ -22221,11 +23070,12 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlign = TFI.getStackAlignment(); + const Align StackAlign(TFI.getStackAlignment()); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value - if (Align > StackAlign) - Result = DAG.getNode(ISD::AND, dl, VT, Result, - DAG.getConstant(-(uint64_t)Align, dl, VT)); + if (Alignment && Alignment > StackAlign) + Result = + DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -22256,9 +23106,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); - if (Align) { + if (Alignment) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, dl, VT)); + DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } @@ -22777,6 +23627,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { @@ -22794,7 +23645,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1)); } case INTR_TYPE_1OP_SAE: { SDValue Sae = Op.getOperand(2); @@ -22866,7 +23718,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), - Src1, Src2, Src3); + {Src1, Src2, Src3}); } case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), @@ -22890,8 +23742,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return 
getVectorMaskingNode( + DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, + Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); @@ -22907,8 +23760,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else return SDValue(); - return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru, + Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); @@ -23114,8 +23967,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } //default rounding mode - return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), - Op.getOperand(2), CC); + return DAG.getNode(IntrData->Opc0, dl, MaskVT, + {Op.getOperand(1), Op.getOperand(2), CC}); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -23315,8 +24168,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, - Mask); + return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), + {Src, PassThru, Mask}); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); @@ -23622,9 +24475,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue ShAmt = Op.getOperand(2); // If the argument is a constant, convert it to a target constant. if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) { - ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32); + // Clamp out of bounds shift amounts since they will otherwise be masked + // to 8-bits which may make it no longer out of bounds. + unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), - Op.getOperand(0), Op.getOperand(1), ShAmt); + Op.getOperand(0), Op.getOperand(1), + DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); } unsigned NewIntrinsic; @@ -23977,7 +24833,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during FinalizeISel in EmitInstrWithCustomInserter. - return SDValue(); + return Op; } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: @@ -24152,9 +25008,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + SDValue Offset = DAG.getUNDEF(VMask.getValueType()); - return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, - MemIntr->getMemOperand(), true /* truncating */); + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask, + MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED, + true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { @@ -24249,7 +25107,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
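The shift-intrinsic change above saturates an over-wide constant shift count to 255 before emitting the i32 target constant, because truncating first could turn an out-of-range count back into an in-range one. A small sketch of the same clamp (hypothetical helper name):

#include <algorithm>
#include <cstdint>

// Saturate, don't truncate: 256 truncated to 8 bits is 0 (a no-op shift),
// while the requested count meant "shift everything out".  Clamping to 255
// keeps the count out of range so the instruction still produces zero.
static uint8_t clampShiftImm(uint64_t Requested) {
  return (uint8_t)std::min<uint64_t>(Requested, 255);   // getLimitedValue(255)
}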
-Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, +Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); @@ -24538,12 +25396,13 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - unsigned StackAlignment = TFI.getStackAlignment(); + const Align StackAlignment(TFI.getStackAlignment()); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot - int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); + int SSFI = + MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); @@ -27464,12 +28323,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) return Op; - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), - N->getBasePtr(), Mask, - getZeroVector(VT, Subtarget, DAG, dl), - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), + N->isExpandingLoad()); // Emit a blend. SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, PassThru); @@ -27503,11 +28361,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); - SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), - N->getBasePtr(), Mask, PassThru, - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), @@ -27553,7 +28410,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), - Mask, N->getMemoryVT(), N->getMemOperand(), + N->getOffset(), Mask, N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); } @@ -27607,29 +28465,31 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); } -SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, - SelectionDAG &DAG) const { - // TODO: Eventually, the lowering of these nodes should be informed by or - // deferred to the GC strategy for the function in which they appear. For - // now, however, they must be lowered to something. Since they are logically - // no-ops in the case of a null GC strategy (or a GC strategy which does not - // require special handling for these nodes), lower them as literal NOOPs for - // the time being. 
- SmallVector<SDValue, 2> Ops; +static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + MVT DstVT = Op.getSimpleValueType(); - Ops.push_back(Op.getOperand(0)); - if (Op->getGluedNode()) - Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); + AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode()); + unsigned SrcAS = N->getSrcAddressSpace(); - SDLoc OpDL(Op); - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); + assert(SrcAS != N->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); - return NOOP; + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) { + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i64) { + Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i32) { + Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + } else { + report_fatal_error("Bad address space in addrspacecast"); + } + return Op; } -SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op, + SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically @@ -27651,9 +28511,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end()); + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; + std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops, + CallOptions, dl, Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Provide custom lowering hooks for some operations. 
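The widening and narrowing rules LowerADDRSPACECAST applies above, written as plain integer operations. This is a sketch assuming the mixed 32/64-bit pointer address spaces the code targets (PTR32_UPTR widens with zero-extension, other 32-bit pointers with sign-extension, and 32-bit destinations truncate); the helper names are not from the commit.

#include <cstdint>

static uint64_t castUPtr32To64(uint32_t P) { return (uint64_t)P; }          // zero-extend
static uint64_t castSPtr32To64(uint32_t P) { return (uint64_t)(int32_t)P; } // sign-extend
static uint32_t castPtr64To32(uint64_t P)  { return (uint32_t)P; }          // truncate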
@@ -27673,7 +28545,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); - case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); + case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); @@ -27690,7 +28562,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); + case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::STRICT_UINT_TO_FP: case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); @@ -27700,21 +28574,24 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); - case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); + case ISD::STRICT_FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG); - case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); - case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); @@ -27778,8 +28655,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: - return LowerGC_TRANSITION_START(Op, DAG); - case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); + case ISD::ADDRSPACECAST: + return LowerADDRSPACECAST(Op, DAG); } } @@ -27865,8 +28743,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case X86ISD::VPMADDWD: case X86ISD::AVG: { - // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and - // X86ISD::AVG/VPMADDWD by widening. + // Legalize types for X86ISD::AVG/VPMADDWD by widening. 
assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); @@ -28114,10 +28991,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: { - bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + case ISD::STRICT_FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_UINT: { + bool IsStrict = N->isStrictFPOpcode(); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT || + N->getOpcode() == ISD::STRICT_FP_TO_SINT; EVT VT = N->getValueType(0); - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (VT.isVector() && VT.getScalarSizeInBits() < 32) { @@ -28128,13 +29009,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), VT.getVectorNumElements()); - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); + SDValue Res; + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. Except // when the result is v2i32 since we can't widen the assert. if (PromoteVT != MVT::v2i32) - Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext - : ISD::AssertSext, + Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl, PromoteVT, Res, DAG.getValueType(VT.getVectorElementType())); @@ -28149,6 +29036,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); return; } @@ -28160,16 +29049,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { + unsigned Opc; + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + + // If we have VLX we can emit a target specific FP_TO_UINT node,. if (!IsSigned && !Subtarget.hasVLX()) { - // If we have VLX we can emit a target specific FP_TO_UINT node, - // otherwise we can defer to the generic legalizer which will widen + // Otherwise we can defer to the generic legalizer which will widen // the input as well. This will be further widened during op // legalization to v8i32<-v8f64. - return; + // For strict nodes we'll need to widen ourselves. + // FIXME: Fix the type legalizer to safely widen strict nodes? + if (!IsStrict) + return; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, + DAG.getConstantFP(0.0, dl, MVT::v2f64)); + Opc = N->getOpcode(); + } + SDValue Res; + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); } - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; - SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); + return; + } + + // Custom widen strict v2f32->v2i32 by padding with zeros. + // FIXME: Should generic type legalizer do this? 
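Several of the strict-FP paths in this hunk widen the source by concatenating with a zero vector rather than undef, so the extra lanes can never raise a floating-point exception. A scalar illustration of why that matters, hedged: converting a NaN to int is formally undefined in C++, but on x86 it lowers to cvttss2si, which sets the invalid flag, which is exactly what the zero padding avoids.

#include <cfenv>
#include <cstdio>
#include <limits>

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float PadLane = 0.0f;                 // what the zero padding holds
  volatile int A = (int)PadLane;                 // raises nothing
  std::printf("zero lane raised invalid: %d\n",
              std::fetestexcept(FE_INVALID) != 0);

  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float JunkLane =
      std::numeric_limits<float>::quiet_NaN();   // what an undef lane might hold
  volatile int B = (int)JunkLane;                // sets FE_INVALID on x86
  std::printf("junk lane raised invalid: %d\n",
              std::fetestexcept(FE_INVALID) != 0);
  (void)A; (void)B;
  return 0;
}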
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) { + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getConstantFP(0.0, dl, MVT::v2f32)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); return; } @@ -28183,64 +29105,168 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); - unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; - // Using a 256-bit input here to guarantee 128-bit input for f32 case. - // TODO: Use 128-bit vectors for f64 case? - // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI. + unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; + // If we use a 128-bit result we might need to use a target specific node. + unsigned SrcElts = + std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits()); MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); - MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts); + MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts); + unsigned Opc = N->getOpcode(); + if (NumElts != SrcElts) { + if (IsStrict) + Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else + Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + } SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); - Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res); + SDValue Chain; + if (IsStrict) { + SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); + Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); return; } - if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned)) + SDValue Chain; + if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { Results.push_back(V); + if (IsStrict) + Results.push_back(Chain); + } return; } - case ISD::SINT_TO_FP: { - assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); - SDValue Src = N->getOperand(0); - if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) - return; - Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); - return; - } - case ISD::UINT_TO_FP: { - assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: { + bool IsStrict = N->isStrictFPOpcode(); + bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || + N->getOpcode() == ISD::STRICT_SINT_TO_FP; EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; - SDValue Src = N->getOperand(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { - Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); + if (IsStrict) { + unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P + : X86ISD::STRICT_CVTUI2P; + SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + unsigned Opc = IsSigned ? 
X86ISD::CVTSI2P : X86ISD::CVTUI2P; + Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src)); + } return; } + if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && + Subtarget.hasSSE41() && !Subtarget.hasAVX512()) { + SDValue Zero = DAG.getConstant(0, dl, SrcVT); + SDValue One = DAG.getConstant(1, dl, SrcVT); + SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, + DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), + DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); + SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); + for (int i = 0; i != 2; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + SignSrc, DAG.getIntPtrConstant(i, dl)); + if (IsStrict) + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, + {N->getOperand(0), Src}); + else + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src); + }; + SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); + SDValue Slow, Chain; + if (IsStrict) { + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SignCvts[0].getValue(1), SignCvts[1].getValue(1)); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other}, + {Chain, SignCvt, SignCvt}); + Chain = Slow.getValue(1); + } else { + Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt); + } + IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg); + IsNeg = + DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1}); + SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt); + Results.push_back(Cvt); + if (IsStrict) + Results.push_back(Chain); + return; + } + if (SrcVT != MVT::v2i32) return; + + if (IsSigned || Subtarget.hasAVX512()) { + if (!IsStrict) + return; + + // Custom widen strict v2i32->v2f32 to avoid scalarization. + // FIXME: Should generic type legalizer do this? + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, + DAG.getConstant(0, dl, MVT::v2i32)); + SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), Src}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + return; + } + + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); - // TODO: Are there any fast-math-flags to propagate here? - SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); - Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + if (IsStrict) { + SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, + {N->getOperand(0), Or, VBias}); + SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, + {MVT::v4f32, MVT::Other}, + {Sub.getValue(1), Sub}); + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } else { + // TODO: Are there any fast-math-flags to propagate here? 
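The uint_to_fp fallbacks in this hunk use two classic integer-to-float tricks. Minimal scalar versions of both, as sketches assuming IEEE-754 binary32/binary64 and with illustrative helper names:

#include <cstdint>
#include <cstring>

// u64 -> f32 without an unsigned convert: when the sign bit is set, halve the
// value keeping the low bit sticky (so rounding is unchanged), convert as
// signed, then double the result -- the SRL/AND/OR plus FADD sequence above.
static float u64ToF32(uint64_t X) {
  if ((int64_t)X >= 0)
    return (float)(int64_t)X;                  // already fits the signed range
  uint64_t Halved = (X >> 1) | (X & 1);        // keep the rounding bit sticky
  float F = (float)(int64_t)Halved;
  return F + F;                                // the "Slow = SignCvt + SignCvt"
}

// u32 -> f64 via the exponent-bias trick: OR the value into the mantissa of
// 2^52 (0x4330000000000000) and subtract 2^52 -- the ZExtIn | VBias, FSUB path.
static double u32ToF64(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X;   // represents 2^52 + X exactly
  double D;
  std::memcpy(&D, &Bits, sizeof(D));           // bit-cast, no int->fp convert
  return D - 4503599627370496.0;               // subtract 2^52, leaving X
}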
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + } return; } + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: { - if (!isTypeLegal(N->getOperand(0).getValueType())) - return; - SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + if (!isTypeLegal(Src.getValueType())) + return; + SDValue V; + if (IsStrict) + V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); + else + V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); + if (IsStrict) + Results.push_back(V.getValue(1)); return; } case ISD::FP_EXTEND: { @@ -28543,6 +29569,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(1)); return; } + case ISD::ADDRSPACECAST: { + SDValue Src = N->getOperand(0); + EVT DstVT = N->getValueType(0); + AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); + unsigned SrcAS = CastN->getSrcAddressSpace(); + + assert(SrcAS != CastN->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); + + SDValue Res; + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) + Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i64) + Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i32) + Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + else + report_fatal_error("Unrecognized addrspacecast type legalization"); + + Results.push_back(Res); + return; + } } } @@ -28566,9 +29614,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP"; + case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; + case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM"; case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; @@ -28653,10 +29704,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT"; case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; + case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; @@ -28676,6 +29729,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; + case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; @@ 
-28776,6 +29830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE"; case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; @@ -28837,6 +29892,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; + case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI"; + case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; @@ -28847,6 +29904,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; + case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P"; + case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P"; case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; @@ -29099,8 +30158,8 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return true; } -bool -X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { +bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; @@ -31518,28 +32577,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case X86ISD::VSRAI: case X86ISD::VSHLI: case X86ISD::VSRLI: { - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { - Known.setAllZero(); - break; - } + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= VT.getScalarSizeInBits()) { + Known.setAllZero(); + break; + } - Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - unsigned ShAmt = ShiftImm->getZExtValue(); - if (Opc == X86ISD::VSHLI) { - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; - // Low bits are known zero. - Known.Zero.setLowBits(ShAmt); - } else if (Opc == X86ISD::VSRLI) { - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); - // High bits are known zero. - Known.Zero.setHighBits(ShAmt); - } else { - Known.Zero.ashrInPlace(ShAmt); - Known.One.ashrInPlace(ShAmt); - } + Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Opc == X86ISD::VSHLI) { + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; + // Low bits are known zero. + Known.Zero.setLowBits(ShAmt); + } else if (Opc == X86ISD::VSRLI) { + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); + // High bits are known zero. 
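The computeKnownBitsForTargetNode change above flattens the constant-shift handling: a count at or past the element width makes the whole element known zero (the x86 vector shift-by-immediate forms do not mask the count), otherwise the known bits move along with the shift and the vacated positions become known zero. A 16-bit sketch of the VSHLI case, using a simplified stand-in for KnownBits:

#include <cstdint>

struct KnownBits16 {
  uint16_t Zero = 0;   // bits proven to be 0
  uint16_t One = 0;    // bits proven to be 1
};

// Known bits through a vector shift-left-by-immediate: out-of-range counts
// zero the element; otherwise known bits shift up and the low ShAmt bits are
// known zero.
static KnownBits16 knownBitsVSHLI(KnownBits16 In, unsigned ShAmt) {
  KnownBits16 Out;
  if (ShAmt >= 16) {
    Out.Zero = 0xFFFF;                         // Known.setAllZero()
    return Out;
  }
  Out.Zero = (uint16_t)((In.Zero << ShAmt) | ((1u << ShAmt) - 1));
  Out.One  = (uint16_t)(In.One << ShAmt);
  return Out;
}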
+ Known.Zero.setHighBits(ShAmt); + } else { + Known.Zero.ashrInPlace(ShAmt); + Known.One.ashrInPlace(ShAmt); } break; } @@ -32103,8 +33160,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { - if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, - Subtarget)) { + if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, + Subtarget)) { DstVT = MaskVT; return true; } @@ -32116,8 +33173,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { - if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, - DAG, Subtarget)) { + if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, + Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); @@ -32155,8 +33212,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, - ForceV2Zero, BlendMask)) { + if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector<int, 8> RepeatedMask; @@ -32410,10 +33467,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. - APInt Zeroable(NumMaskElts, 0); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (isUndefOrZero(Mask[i])) - Zeroable.setBit(i); + APInt KnownUndef, KnownZero; + resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); + APInt Zeroable = KnownUndef | KnownZero; if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load @@ -32834,7 +33890,8 @@ static SDValue combineX86ShuffleChainWithExtract( Offset += Src.getConstantOperandVal(1); Src = Src.getOperand(0); } - WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits()); + WideSizeInBits = std::max(WideSizeInBits, + (unsigned)Src.getValueSizeInBits()); assert((Offset % BaseVT.getVectorNumElements()) == 0 && "Unexpected subvector extraction"); Offset /= BaseVT.getVectorNumElements(); @@ -33026,6 +34083,10 @@ static SDValue combineX86ShufflesRecursively( ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + assert(RootMask.size() > 0 && + (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && + "Illegal shuffle root mask"); + // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; @@ -33056,106 +34117,137 @@ static SDValue combineX86ShufflesRecursively( OpZero, DAG, Depth, false)) return SDValue(); - resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); - - // Add the inputs to the Ops list, avoiding duplicates. 
- SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); - - auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { - // Attempt to find an existing match. - SDValue InputBC = peekThroughBitcasts(Input); - for (int i = 0, e = Ops.size(); i < e; ++i) - if (InputBC == peekThroughBitcasts(Ops[i])) - return i; - // Match failed - should we replace an existing Op? - if (InsertionPoint >= 0) { - Ops[InsertionPoint] = Input; - return InsertionPoint; + SmallVector<int, 64> Mask; + SmallVector<SDValue, 16> Ops; + + // We don't need to merge masks if the root is empty. + bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1); + if (EmptyRoot) { + // Only resolve zeros if it will remove an input, otherwise we might end + // up in an infinite loop. + bool ResolveKnownZeros = true; + if (!OpZero.isNullValue()) { + APInt UsedInputs = APInt::getNullValue(OpInputs.size()); + for (int i = 0, e = OpMask.size(); i != e; ++i) { + int M = OpMask[i]; + if (OpUndef[i] || OpZero[i] || isUndefOrZero(M)) + continue; + UsedInputs.setBit(M / OpMask.size()); + if (UsedInputs.isAllOnesValue()) { + ResolveKnownZeros = false; + break; + } + } } - // Add to the end of the Ops list. - Ops.push_back(Input); - return Ops.size() - 1; - }; + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero, + ResolveKnownZeros); - SmallVector<int, 2> OpInputIdx; - for (SDValue OpInput : OpInputs) - OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1)); - - assert(((RootMask.size() > OpMask.size() && - RootMask.size() % OpMask.size() == 0) || - (OpMask.size() > RootMask.size() && - OpMask.size() % RootMask.size() == 0) || - OpMask.size() == RootMask.size()) && - "The smaller number of elements must divide the larger."); - - // This function can be performance-critical, so we rely on the power-of-2 - // knowledge that we have about the mask sizes to replace div/rem ops with - // bit-masks and shifts. - assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); - unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); - unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); - - unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); - unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); - unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); - assert((RootRatio == 1 || OpRatio == 1) && - "Must not have a ratio for both incoming and op masks!"); - - assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); - assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); - unsigned RootRatioLog2 = countTrailingZeros(RootRatio); - unsigned OpRatioLog2 = countTrailingZeros(OpRatio); - - SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef); - - // Merge this shuffle operation's mask into our accumulated mask. Note that - // this shuffle's mask will be the first applied to the input, followed by the - // root mask to get us all the way to the root value arrangement. The reason - // for this order is that we are recursing up the operation chain. - for (unsigned i = 0; i < MaskWidth; ++i) { - unsigned RootIdx = i >> RootRatioLog2; - if (RootMask[RootIdx] < 0) { - // This is a zero or undef lane, we're done. 
- Mask[i] = RootMask[RootIdx]; - continue; - } + Mask = OpMask; + Ops.append(OpInputs.begin(), OpInputs.end()); + } else { + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); + + // Add the inputs to the Ops list, avoiding duplicates. + Ops.append(SrcOps.begin(), SrcOps.end()); + + auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { + // Attempt to find an existing match. + SDValue InputBC = peekThroughBitcasts(Input); + for (int i = 0, e = Ops.size(); i < e; ++i) + if (InputBC == peekThroughBitcasts(Ops[i])) + return i; + // Match failed - should we replace an existing Op? + if (InsertionPoint >= 0) { + Ops[InsertionPoint] = Input; + return InsertionPoint; + } + // Add to the end of the Ops list. + Ops.push_back(Input); + return Ops.size() - 1; + }; - unsigned RootMaskedIdx = - RootRatio == 1 - ? RootMask[RootIdx] - : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); + SmallVector<int, 2> OpInputIdx; + for (SDValue OpInput : OpInputs) + OpInputIdx.push_back( + AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1)); + + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + + // This function can be performance-critical, so we rely on the power-of-2 + // knowledge that we have about the mask sizes to replace div/rem ops with + // bit-masks and shifts. + assert(isPowerOf2_32(RootMask.size()) && + "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); + unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); + + unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size()); + unsigned RootRatio = + std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2); + unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2); + assert((RootRatio == 1 || OpRatio == 1) && + "Must not have a ratio for both incoming and op masks!"); + + assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); + assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); + unsigned RootRatioLog2 = countTrailingZeros(RootRatio); + unsigned OpRatioLog2 = countTrailingZeros(OpRatio); + + Mask.resize(MaskWidth, SM_SentinelUndef); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by + // the root mask to get us all the way to the root value arrangement. The + // reason for this order is that we are recursing up the operation chain. + for (unsigned i = 0; i < MaskWidth; ++i) { + unsigned RootIdx = i >> RootRatioLog2; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask[i] = RootMask[RootIdx]; + continue; + } - // Just insert the scaled root mask value if it references an input other - // than the SrcOp we're currently inserting. - if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || - (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { - Mask[i] = RootMaskedIdx; - continue; - } + unsigned RootMaskedIdx = + RootRatio == 1 + ? 
RootMask[RootIdx] + : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); - RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); - unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; - if (OpMask[OpIdx] < 0) { - // The incoming lanes are zero or undef, it doesn't matter which ones we - // are using. - Mask[i] = OpMask[OpIdx]; - continue; - } + // Just insert the scaled root mask value if it references an input other + // than the SrcOp we're currently inserting. + if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || + (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { + Mask[i] = RootMaskedIdx; + continue; + } - // Ok, we have non-zero lanes, map them through to one of the Op's inputs. - unsigned OpMaskedIdx = - OpRatio == 1 - ? OpMask[OpIdx] - : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); + RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); + unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask[i] = OpMask[OpIdx]; + continue; + } - OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); - int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); - assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); - OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; + // Ok, we have non-zero lanes, map them through to one of the Op's inputs. + unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx] + : (OpMask[OpIdx] << OpRatioLog2) + + (RootMaskedIdx & (OpRatio - 1)); - Mask[i] = OpMaskedIdx; + OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); + int InputIdx = OpMask[OpIdx] / (int)OpMask.size(); + assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input"); + OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth; + + Mask[i] = OpMaskedIdx; + } } // Remove unused/repeated shuffle source ops. @@ -33189,13 +34281,18 @@ static SDValue combineX86ShufflesRecursively( // the remaining recursion depth. if (Ops.size() < (MaxRecursionDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { + // For empty roots, we need to resolve zeroable elements before combining + // them with other shuffles. + SmallVector<int, 64> ResolvedMask = Mask; + if (EmptyRoot) + resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero); bool AllowVar = false; if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( - Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, - AllowVar, DAG, Subtarget)) + Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, + HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } } @@ -34207,6 +35304,15 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, In.getOperand(0).getValueType() == MVT::v2i64) return N->getOperand(0); // return the bitcast break; + case X86ISD::STRICT_CVTTP2SI: + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::STRICT_CVTSI2P: + case X86ISD::STRICT_CVTUI2P: + case X86ISD::STRICT_VFPROUND: + if (In.getOperand(1).getValueType() == MVT::v2f64 || + In.getOperand(1).getValueType() == MVT::v2i64) + return N->getOperand(0); + break; } } @@ -34698,6 +35804,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; } + // If we don't demand all elements, then attempt to combine to a simpler + // shuffle. + // TODO: Handle other depths, but first we need to handle the fact that + // it might combine to the same shuffle. 
+ if (!DemandedElts.isAllOnesValue() && Depth == 0) { + SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef); + for (int i = 0; i != NumElts; ++i) + if (DemandedElts[i]) + DemandedMask[i] = i; + + SDValue NewShuffle = combineX86ShufflesRecursively( + {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false, + /*AllowVarMask*/ true, TLO.DAG, Subtarget); + if (NewShuffle) + return TLO.CombineTo(Op, NewShuffle); + } + return false; } @@ -34739,117 +35862,110 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } case X86ISD::VSHLI: { SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); - - // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a - // single shift. We can do this if the bottom bits (which are shifted - // out) are never demanded. - if (Op0.getOpcode() == X86ISD::VSRLI && - OriginalDemandedBits.countTrailingZeros() >= ShAmt) { - if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) { - if (Shift2Imm->getAPIntValue().ult(BitWidth)) { - int Diff = ShAmt - Shift2Imm->getZExtValue(); - if (Diff == 0) - return TLO.CombineTo(Op, Op0.getOperand(0)); - - unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; - SDValue NewShift = TLO.DAG.getNode( - NewOpc, SDLoc(Op), VT, Op0.getOperand(0), - TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); - return TLO.CombineTo(Op, NewShift); - } - } + APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); + + // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a + // single shift. We can do this if the bottom bits (which are shifted + // out) are never demanded. + if (Op0.getOpcode() == X86ISD::VSRLI && + OriginalDemandedBits.countTrailingZeros() >= ShAmt) { + unsigned Shift2Amt = Op0.getConstantOperandVal(1); + if (Shift2Amt < BitWidth) { + int Diff = ShAmt - Shift2Amt; + if (Diff == 0) + return TLO.CombineTo(Op, Op0.getOperand(0)); + + unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; + SDValue NewShift = TLO.DAG.getNode( + NewOpc, SDLoc(Op), VT, Op0.getOperand(0), + TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); + return TLO.CombineTo(Op, NewShift); } + } - if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, - TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, + TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero <<= ShAmt; + Known.One <<= ShAmt; - // Low bits known zero. - Known.Zero.setLowBits(ShAmt); - } + // Low bits known zero. 
+ Known.Zero.setLowBits(ShAmt); break; } case X86ISD::VSRLI: { - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = Op.getConstantOperandVal(1); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits << ShAmt; + APInt DemandedMask = OriginalDemandedBits << ShAmt; - if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, - OriginalDemandedElts, Known, TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, + OriginalDemandedElts, Known, TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - // High bits known zero. - Known.Zero.setHighBits(ShAmt); - } + // High bits known zero. + Known.Zero.setHighBits(ShAmt); break; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) { - if (ShiftImm->getAPIntValue().uge(BitWidth)) - break; + unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue(); + if (ShAmt >= BitWidth) + break; - unsigned ShAmt = ShiftImm->getZExtValue(); - APInt DemandedMask = OriginalDemandedBits << ShAmt; + APInt DemandedMask = OriginalDemandedBits << ShAmt; - // If we just want the sign bit then we don't need to shift it. - if (OriginalDemandedBits.isSignMask()) - return TLO.CombineTo(Op, Op0); + // If we just want the sign bit then we don't need to shift it. + if (OriginalDemandedBits.isSignMask()) + return TLO.CombineTo(Op, Op0); - // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 - if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) { - SDValue Op00 = Op0.getOperand(0); - unsigned NumSignBits = - TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); - if (ShAmt < NumSignBits) - return TLO.CombineTo(Op, Op00); - } + // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 + if (Op0.getOpcode() == X86ISD::VSHLI && + Op.getOperand(1) == Op0.getOperand(1)) { + SDValue Op00 = Op0.getOperand(0); + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); + if (ShAmt < NumSignBits) + return TLO.CombineTo(Op, Op00); + } - // If any of the demanded bits are produced by the sign extension, we also - // demand the input sign bit. - if (OriginalDemandedBits.countLeadingZeros() < ShAmt) - DemandedMask.setSignBit(); + // If any of the demanded bits are produced by the sign extension, we also + // demand the input sign bit. + if (OriginalDemandedBits.countLeadingZeros() < ShAmt) + DemandedMask.setSignBit(); - if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, - TLO, Depth + 1)) - return true; + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, + TLO, Depth + 1)) + return true; - assert(!Known.hasConflict() && "Bits known to be one AND zero?"); - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + Known.Zero.lshrInPlace(ShAmt); + Known.One.lshrInPlace(ShAmt); - // If the input sign bit is known to be zero, or if none of the top bits - // are demanded, turn this into an unsigned shift right. 
- if (Known.Zero[BitWidth - ShAmt - 1] || - OriginalDemandedBits.countLeadingZeros() >= ShAmt) - return TLO.CombineTo( - Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); + // If the input sign bit is known to be zero, or if none of the top bits + // are demanded, turn this into an unsigned shift right. + if (Known.Zero[BitWidth - ShAmt - 1] || + OriginalDemandedBits.countLeadingZeros() >= ShAmt) + return TLO.CombineTo( + Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); - // High bits are known one. - if (Known.One[BitWidth - ShAmt - 1]) - Known.One.setHighBits(ShAmt); - } + // High bits are known one. + if (Known.One[BitWidth - ShAmt - 1]) + Known.One.setHighBits(ShAmt); break; } case X86ISD::PEXTRB: @@ -35005,6 +36121,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( return Vec; break; } + case X86ISD::PCMPGT: + // icmp sgt(0, R) == ashr(R, BitWidth-1). + // iff we only need the sign bit then we can use R directly. + if (DemandedBits.isSignMask() && + ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) + return Op.getOperand(1); + break; } APInt ShuffleUndef, ShuffleZero; @@ -35053,123 +36176,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( Op, DemandedBits, DemandedElts, DAG, Depth); } -/// Check if a vector extract from a target-specific shuffle of a load can be -/// folded into a single element load. -/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been custom lowered so we need to handle those here. -static SDValue -XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue InVec = N->getOperand(0); - SDValue EltNo = N->getOperand(1); - EVT EltVT = N->getValueType(0); - - if (!isa<ConstantSDNode>(EltNo)) - return SDValue(); - - EVT OriginalVT = InVec.getValueType(); - unsigned NumOriginalElts = OriginalVT.getVectorNumElements(); - - // Peek through bitcasts, don't duplicate a load with other uses. - InVec = peekThroughOneUseBitcasts(InVec); - - EVT CurrentVT = InVec.getValueType(); - if (!CurrentVT.isVector()) - return SDValue(); - - unsigned NumCurrentElts = CurrentVT.getVectorNumElements(); - if ((NumOriginalElts % NumCurrentElts) != 0) - return SDValue(); - - if (!isTargetShuffle(InVec.getOpcode())) - return SDValue(); - - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - - SmallVector<int, 16> ShuffleMask; - SmallVector<SDValue, 2> ShuffleOps; - bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, - ShuffleOps, ShuffleMask, UnaryShuffle)) - return SDValue(); - - unsigned Scale = NumOriginalElts / NumCurrentElts; - if (Scale > 1) { - SmallVector<int, 16> ScaledMask; - scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask); - ShuffleMask = std::move(ScaledMask); - } - assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch"); - - // Select the input vector, guarding against out of range extract vector. - int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt]; - - if (Idx == SM_SentinelZero) - return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) - : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); - if (Idx == SM_SentinelUndef) - return DAG.getUNDEF(EltVT); - - // Bail if any mask element is SM_SentinelZero - getVectorShuffle below - // won't handle it. 
- if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) - return SDValue(); - - assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) && - "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1]; - - // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = - (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; - - if (LdNode.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) - return SDValue(); - - AllowedUses = 1; // only allow 1 load use if we have a bitcast - LdNode = LdNode.getOperand(0); - } - - if (!ISD::isNormalLoad(LdNode.getNode())) - return SDValue(); - - LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); - - if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple()) - return SDValue(); - - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( - EltVT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) - return SDValue(); - - // All checks match so transform back to vector_shuffle so that DAG combiner - // can finish the job - SDLoc dl(N); - - // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT) - : DAG.getBitcast(OriginalVT, ShuffleOps[1]); - Shuffle = DAG.getVectorShuffle(OriginalVT, dl, - DAG.getBitcast(OriginalVT, ShuffleOps[0]), - Shuffle, ShuffleMask); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, - EltNo); -} - // Helper to peek through bitops/setcc to determine size of source vector. // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { @@ -35714,7 +36720,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const X86Subtarget &Subtarget) { // Find the appropriate width for the PSADBW. EVT InVT = Zext0.getOperand(0).getValueType(); - unsigned RegSize = std::max(128u, InVT.getSizeInBits()); + unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. @@ -36263,6 +37269,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); + // We need at least SSE2 to anything here. + if (!Subtarget.hasSSE2()) + return SDValue(); + ISD::NodeType Opc; SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true); @@ -36382,8 +37392,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc dl(InputVector); bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); - if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements())) + if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts)) return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); // Integer Constant Folding. 
@@ -36419,14 +37430,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad, - // combineHorizontalPredicateResult and combineBasicSADPattern. + // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and + // combineBasicSADPattern. return SDValue(); } - if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) - return NewOp; - // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { @@ -36482,7 +37490,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, }; if (all_of(InputVector->uses(), IsBoolExtract) && BoolExtracts.size() > 1) { - unsigned NumSrcElts = SrcVT.getVectorNumElements(); EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { @@ -36568,9 +37575,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); - ISD::CondCode NewCC = - ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), - Cond.getOperand(0).getValueType().isInteger()); + ISD::CondCode NewCC = ISD::getSetCCInverse( + cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); @@ -36761,37 +37767,117 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, if (VT.is512BitVector()) return SDValue(); - // TODO: Add other opcodes eventually lowered into BLEND. - for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); - UI != UE; ++UI) - if ((UI->getOpcode() != ISD::VSELECT && - UI->getOpcode() != X86ISD::BLENDV) || - UI.getOperandNo() != 0) + auto OnlyUsedAsSelectCond = [](SDValue Cond) { + for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); + UI != UE; ++UI) + if ((UI->getOpcode() != ISD::VSELECT && + UI->getOpcode() != X86ISD::BLENDV) || + UI.getOperandNo() != 0) + return false; + + return true; + }; + + if (OnlyUsedAsSelectCond(Cond)) { + APInt DemandedMask(APInt::getSignMask(BitWidth)); + KnownBits Known; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) return SDValue(); + // If we changed the computation somewhere in the DAG, this change will + // affect all users of Cond. Update all the nodes so that we do not use + // the generic VSELECT anymore. Otherwise, we may perform wrong + // optimizations as we messed with the actual expectation for the vector + // boolean values. + for (SDNode *U : Cond->uses()) { + if (U->getOpcode() == X86ISD::BLENDV) + continue; + + SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), + Cond, U->getOperand(1), U->getOperand(2)); + DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); + DCI.AddToWorklist(U); + } + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(N, 0); + } + + // Otherwise we can still at least try to simplify multiple use bits. 
APInt DemandedMask(APInt::getSignMask(BitWidth)); + APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements())); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); - if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) + if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask, + DemandedElts, DAG, 0)) + return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), + V, N->getOperand(1), N->getOperand(2)); + + return SDValue(); +} + +// Try to match: +// (or (and (M, (sub 0, X)), (pandn M, X))) +// which is a special case of: +// (select M, (sub 0, X), X) +// Per: +// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate +// We know that, if fNegate is 0 or 1: +// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) +// +// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: +// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) +// ( M ? -X : X) == ((X ^ M ) + (M & 1)) +// This lets us transform our vselect to: +// (add (xor X, M), (and M, 1)) +// And further to: +// (sub (xor X, M), M) +static SDValue combineLogicBlendIntoConditionalNegate( + EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + EVT MaskVT = Mask.getValueType(); + assert(MaskVT.isInteger() && + DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && + "Mask must be zero/all-bits"); + + if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) + return SDValue(); + if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) return SDValue(); - // If we changed the computation somewhere in the DAG, this change will - // affect all users of Cond. Update all the nodes so that we do not use - // the generic VSELECT anymore. Otherwise, we may perform wrong - // optimizations as we messed with the actual expectation for the vector - // boolean values. - for (SDNode *U : Cond->uses()) { - if (U->getOpcode() == X86ISD::BLENDV) - continue; + auto IsNegV = [](SDNode *N, SDValue V) { + return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && + ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); + }; - SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), - Cond, U->getOperand(1), U->getOperand(2)); - DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); - DCI.AddToWorklist(U); - } - DCI.CommitTargetLoweringOpt(TLO); - return SDValue(N, 0); + SDValue V; + if (IsNegV(Y.getNode(), X)) + V = X; + else if (IsNegV(X.getNode(), Y)) + V = Y; + else + return SDValue(); + + SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); + SDValue SubOp2 = Mask; + + // If the negate was on the false side of the select, then + // the operands of the SUB need to be swapped. PR 27251. + // This is because the pattern being matched above is + // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) + // but if the pattern matched was + // (vselect M, X, (sub (0, X))), that is really negation of the pattern + // above, -(vselect M, (sub 0, X), X), and therefore the replacement + // pattern also needs to be a negation of the replacement pattern above. + // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the + // sub accomplishes the negation of the replacement pattern. + if (V == Y) + std::swap(SubOp1, SubOp2); + + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } /// Do target-specific dag combines on SELECT and VSELECT nodes. 
@@ -36811,10 +37897,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()); + + // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M). + // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT + // can't catch, plus vXi8 cases where we'd likely end up with BLENDV. + if (CondVT.isVector() && CondVT.isInteger() && + CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() && + (!CondConstantVector || CondVT.getScalarType() == MVT::i8) && + DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits()) + if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS, + DL, DAG, Subtarget)) + return V; // Convert vselects with constant condition into shuffles. - if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && - DCI.isBeforeLegalizeOps()) { + if (CondConstantVector && DCI.isBeforeLegalizeOps()) { SmallVector<int, 64> Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); @@ -36843,7 +37940,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; @@ -36854,7 +37951,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMIN; @@ -36873,7 +37970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMAX; @@ -36883,7 +37980,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; @@ -36911,7 +38008,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) @@ -36922,8 +38019,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. 
- if (!DAG.getTarget().Options.UnsafeFPMath && - (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMIN; break; @@ -36948,7 +38044,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. - if (!DAG.getTarget().Options.UnsafeFPMath && + if (!DAG.getTarget().Options.NoSignedZerosFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) @@ -37093,7 +38189,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, VT.getVectorElementType()); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } @@ -37165,7 +38261,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, SDValue Other; if (ISD::isBuildVectorAllOnes(LHS.getNode())) { Other = RHS; - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, VT.getVectorElementType()); } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) { Other = LHS; } @@ -37788,7 +38884,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } /// Different mul shrinking modes. -enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; +enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); @@ -37809,16 +38905,16 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) - Mode = MULS8; + Mode = ShrinkMode::MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) - Mode = MULU8; + Mode = ShrinkMode::MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) - Mode = MULS16; + Mode = ShrinkMode::MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) - Mode = MULU16; + Mode = ShrinkMode::MULU16; else return false; return true; @@ -37888,15 +38984,17 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8) + return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND + : ISD::SIGN_EXTEND, DL, VT, MulLo); MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. - SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); + SDValue MulHi = + DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL, + ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. @@ -38294,7 +39392,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { // We shift all of the values by one. 
In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. - if (N1SplatC->getAPIntValue() == 1) + if (N1SplatC->isOne()) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } @@ -38546,15 +39644,15 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); - assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); + assert(N->getOperand(1).getValueType() == MVT::i8 && + "Unexpected shift amount type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. - unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue(); + unsigned ShiftVal = N->getConstantOperandVal(1); if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); @@ -39094,6 +40192,71 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } + +// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) +// Where C is a mask containing the same number of bits as the setcc and +// where the setcc will freely 0 upper bits of k-register. We can replace the +// undef in the concat with 0s and remove the AND. This mainly helps with +// v2i1/v4i1 setcc being casted to scalar. +static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); + + EVT VT = N->getValueType(0); + + // Make sure this is an AND with constant. We will check the value of the + // constant later. + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + + // This is implied by the ConstantSDNode. + assert(!VT.isVector() && "Expected scalar VT!"); + + if (N->getOperand(0).getOpcode() != ISD::BITCAST || + !N->getOperand(0).hasOneUse() || + !N->getOperand(0).getOperand(0).hasOneUse()) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Src = N->getOperand(0).getOperand(0); + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 || + !TLI.isTypeLegal(SrcVT)) + return SDValue(); + + if (Src.getOpcode() != ISD::CONCAT_VECTORS) + return SDValue(); + + // We only care about the first subvector of the concat, we expect the + // other subvectors to be ignored due to the AND if we make the change. + SDValue SubVec = Src.getOperand(0); + EVT SubVecVT = SubVec.getValueType(); + + // First subvector should be a setcc with a legal result type. The RHS of the + // AND should be a mask with this many bits. + if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) || + !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements())) + return SDValue(); + + EVT SetccVT = SubVec.getOperand(0).getValueType(); + if (!TLI.isTypeLegal(SetccVT) || + !(Subtarget.hasVLX() || SetccVT.is512BitVector())) + return SDValue(); + + if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32)) + return SDValue(); + + // We passed all the checks. Rebuild the concat_vectors with zeroes + // and cast it back to VT. 
+ SDLoc dl(N); + SmallVector<SDValue, 4> Ops(Src.getNumOperands(), + DAG.getConstant(0, dl, SubVecVT)); + Ops[0] = SubVec; + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, + Ops); + return DAG.getBitcast(VT, Concat); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -39132,9 +40295,12 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) + Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { APInt AllBits = APInt::getAllOnesValue(NumElts); return DAG.getSetCC(dl, MVT::i1, Mask, @@ -39143,6 +40309,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } } + if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) + return V; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -39290,68 +40459,6 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { return true; } -// Try to match: -// (or (and (M, (sub 0, X)), (pandn M, X))) -// which is a special case of vselect: -// (vselect M, (sub 0, X), X) -// Per: -// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate -// We know that, if fNegate is 0 or 1: -// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) -// -// Here, we have a mask, M (all 1s or 0), and, similarly, we know that: -// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) -// ( M ? -X : X) == ((X ^ M ) + (M & 1)) -// This lets us transform our vselect to: -// (add (xor X, M), (and M, 1)) -// And further to: -// (sub (xor X, M), M) -static SDValue combineLogicBlendIntoConditionalNegate( - EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, - SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT MaskVT = Mask.getValueType(); - assert(MaskVT.isInteger() && - DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && - "Mask must be zero/all-bits"); - - if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) - return SDValue(); - if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) - return SDValue(); - - auto IsNegV = [](SDNode *N, SDValue V) { - return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && - ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); - }; - - SDValue V; - if (IsNegV(Y.getNode(), X)) - V = X; - else if (IsNegV(X.getNode(), Y)) - V = Y; - else - return SDValue(); - - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); - SDValue SubOp2 = Mask; - - // If the negate was on the false side of the select, then - // the operands of the SUB need to be swapped. PR 27251. - // This is because the pattern being matched above is - // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) - // but if the pattern matched was - // (vselect M, X, (sub (0, X))), that is really negation of the pattern - // above, -(vselect M, (sub 0, X), X), and therefore the replacement - // pattern also needs to be a negation of the replacement pattern above. - // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the - // sub accomplishes the negation of the replacement pattern. 
- if (V == Y) - std::swap(SubOp1, SubOp2); - - SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); - return DAG.getBitcast(VT, Res); -} - // Try to fold: // (or (and (m, y), (pandn m, x))) // into: @@ -39512,66 +40619,20 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } -static SDValue combineOr(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node"); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If this is SSE1 only convert to FOR to avoid scalarization. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { - return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, - DAG.getBitcast(MVT::v4f32, N0), - DAG.getBitcast(MVT::v4f32, N1))); - } - - // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. - // TODO: Support multiple SrcOps. - if (VT == MVT::i1) { - SmallVector<SDValue, 2> SrcOps; - if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && - SrcOps.size() == 1) { - SDLoc dl(N); - unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); - EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); - SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); - if (Mask) { - APInt AllBits = APInt::getNullValue(NumElts); - return DAG.getSetCC(dl, MVT::i1, Mask, - DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); - } - } - } - - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) - return R; - - if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) - return FPLogic; - - if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) - return R; - - if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) - return R; - - // Attempt to recursively combine an OR of shuffles. - if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { - SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) - return Res; - } - - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) + if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) || + !TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool OptForSize = DAG.shouldOptForSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some @@ -39589,11 +40650,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); + EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + SDValue ShAmt0 = N0.getOperand(1); - if (ShAmt0.getValueType() != MVT::i8) + if (ShAmt0.getValueType() != ShiftVT) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); - if (ShAmt1.getValueType() != MVT::i8) + if (ShAmt1.getValueType() != ShiftVT) return SDValue(); // Peek through any modulo shift masks. 
@@ -39628,12 +40691,12 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, std::swap(ShMsk0, ShMsk1); } - auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1, - SDValue Amt) { + auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1, + SDValue Amt) { if (Opc == ISD::FSHR) std::swap(Op0, Op1); return DAG.getNode(Opc, DL, VT, Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt)); + DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt)); }; // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) @@ -39674,7 +40737,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandAPInt(1) == 1) { + Op1.getConstantOperandAPInt(1).isOneValue()) { return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). @@ -39689,6 +40752,70 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineOr(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // If this is SSE1 only convert to FOR to avoid scalarization. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { + return DAG.getBitcast(MVT::v4i32, + DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N0), + DAG.getBitcast(MVT::v4f32, N1))); + } + + // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) + Mask = DAG.getBitcast(MaskVT, SrcOps[0]); + if (Mask) { + APInt AllBits = APInt::getNullValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + } + } + } + + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) + return R; + + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) + return R; + + if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) + return R; + + if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget)) + return R; + + // Attempt to recursively combine an OR of shuffles. 
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } + + return SDValue(); +} + /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: @@ -39758,8 +40885,8 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, default: return SDValue(); case MVT::v16i8: case MVT::v8i16: - case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; - case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; + case MVT::v4i32: + case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: @@ -39783,7 +40910,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. - return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); + return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT); } /// Detect patterns of truncation with unsigned saturation: @@ -39950,7 +41077,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) { - unsigned TruncOpc; + unsigned TruncOpc = 0; SDValue SatVal; if (auto SSatVal = detectSSatPattern(In, VT)) { SatVal = SSatVal; @@ -40252,6 +41379,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. @@ -40279,6 +41407,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); @@ -40314,10 +41443,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. 
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), - ML->getMask(), DAG.getUNDEF(VT), - ML->getMemoryVT(), ML->getMemOperand(), - ML->getExtensionType()); + SDValue NewML = DAG.getMaskedLoad( + VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(), + DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), + ML->getAddressingMode(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); @@ -40403,8 +41532,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), Mst->getMemoryVT())) { return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); + Mst->getBasePtr(), Mst->getOffset(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), + Mst->getAddressingMode(), true); } return SDValue(); @@ -40593,59 +41723,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, cast<LoadSDNode>(St->getValue())->isSimple() && St->getChain().hasOneUse() && St->isSimple()) { LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); - SmallVector<SDValue, 8> Ops; if (!ISD::isNormalLoad(Ld)) return SDValue(); - // If this is not the MMX case, i.e. we are just turning i64 load/store - // into f64 load/store, avoid the transformation if there are multiple - // uses of the loaded value. - if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + // Avoid the transformation if there are multiple uses of the loaded value. + if (!Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); - // If we are a 64-bit capable x86, lower to a single movq load/store pair. - // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store - // pair instead. - if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; - SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), - Ld->getMemOperand()); - - // Make sure new load is placed in same chain order. - DAG.makeEquivalentMemoryOrdering(Ld, NewLd); - return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), - St->getMemOperand()); - } - - // Otherwise, lower to two pairs of 32-bit loads / stores. - SDValue LoAddr = Ld->getBasePtr(); - SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); - - SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, - Ld->getPointerInfo().getWithOffset(4), - MinAlign(Ld->getAlignment(), 4), - Ld->getMemOperand()->getFlags()); - // Make sure new loads are placed in same chain order. - DAG.makeEquivalentMemoryOrdering(Ld, LoLd); - DAG.makeEquivalentMemoryOrdering(Ld, HiLd); - - LoAddr = St->getBasePtr(); - HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); - - SDValue LoSt = - DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); - SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, - St->getPointerInfo().getWithOffset(4), - MinAlign(St->getAlignment(), 4), - St->getMemOperand()->getFlags()); - return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + // Lower to a single movq load/store pair. 
+ SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(), + Ld->getBasePtr(), Ld->getMemOperand()); + + // Make sure new load is placed in same chain order. + DAG.makeEquivalentMemoryOrdering(Ld, NewLd); + return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), + St->getMemOperand()); } // This is similar to the above case, but here we handle a scalar 64-bit @@ -41351,23 +42446,25 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { SDValue Op = peekThroughBitcasts(SDValue(N, 0)); EVT VT = Op->getValueType(0); - // Make sure the element size does't change. + + // Make sure the element size doesn't change. if (VT.getScalarSizeInBits() != ScalarSize) return SDValue(); - if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) { + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::VECTOR_SHUFFLE: { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. - if (!SVOp->getOperand(1).isUndef()) + if (!Op.getOperand(1).isUndef()) return SDValue(); - if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1)) + if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1)) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? - return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), - SVOp->getMask()); - return SDValue(); + return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT), + cast<ShuffleVectorSDNode>(Op)->getMask()); + break; } - unsigned Opc = Op.getOpcode(); - if (Opc == ISD::INSERT_VECTOR_ELT) { + case ISD::INSERT_VECTOR_ELT: { // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF, // -V, INDEX). SDValue InsVector = Op.getOperand(0); @@ -41378,34 +42475,35 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); - return SDValue(); + break; } + case ISD::FSUB: + case ISD::XOR: + case X86ISD::FXOR: { + SDValue Op1 = Op.getOperand(1); + SDValue Op0 = Op.getOperand(0); - if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB) - return SDValue(); - - SDValue Op1 = Op.getOperand(1); - SDValue Op0 = Op.getOperand(0); - - // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit - // masks. For FSUB, we have to check if constant bits of Op0 are sign bit - // masks and hence we swap the operands. - if (Opc == ISD::FSUB) - std::swap(Op0, Op1); + // For XOR and FXOR, we want to check if constant + // bits of Op1 are sign bit masks. For FSUB, we + // have to check if constant bits of Op0 are sign + // bit masks and hence we swap the operands. + if (Opc == ISD::FSUB) + std::swap(Op0, Op1); - APInt UndefElts; - SmallVector<APInt, 16> EltBits; - // Extract constant bits and see if they are all sign bit masks. Ignore the - // undef elements. - if (getTargetConstantBitsFromNode(Op1, ScalarSize, - UndefElts, EltBits, - /* AllowWholeUndefs */ true, - /* AllowPartialUndefs */ false)) { - for (unsigned I = 0, E = EltBits.size(); I < E; I++) - if (!UndefElts[I] && !EltBits[I].isSignMask()) - return SDValue(); + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + // Extract constant bits and see if they are all + // sign bit masks. Ignore the undef elements. 
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, + /* AllowWholeUndefs */ true, + /* AllowPartialUndefs */ false)) { + for (unsigned I = 0, E = EltBits.size(); I < E; I++) + if (!UndefElts[I] && !EltBits[I].isSignMask()) + return SDValue(); - return peekThroughBitcasts(Op0); + return peekThroughBitcasts(Op0); + } + } } return SDValue(); @@ -41642,8 +42740,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { return SDValue(); SDValue LHS = N->getOperand(0); - auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC) + if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC) return SDValue(); X86::CondCode NewCC = X86::GetOppositeBranchCondition( @@ -41817,8 +42914,9 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); - // Only perform optimizations if UnsafeMath is used. - if (!DAG.getTarget().Options.UnsafeFPMath) + // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed. + if (!DAG.getTarget().Options.NoNaNsFPMath || + !DAG.getTarget().Options.NoSignedZerosFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes @@ -41943,6 +43041,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // FIXME: Handle strict fp nodes. EVT VT = N->getValueType(0); // Convert a full vector load into vzload when not all bits are needed. @@ -41951,7 +43050,7 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); - LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + LoadSDNode *LN = cast<LoadSDNode>(In); // Unless the load is volatile or atomic. if (LN->isSimple()) { SDLoc dl(N); @@ -42569,6 +43668,44 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a +/// recognizable memcmp expansion. +static bool isOrXorXorTree(SDValue X, bool Root = true) { + if (X.getOpcode() == ISD::OR) + return isOrXorXorTree(X.getOperand(0), false) && + isOrXorXorTree(X.getOperand(1), false); + if (Root) + return false; + return X.getOpcode() == ISD::XOR; +} + +/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp +/// expansion. 
+template<typename F> +static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG, + EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { + SDValue Op0 = X.getOperand(0); + SDValue Op1 = X.getOperand(1); + if (X.getOpcode() == ISD::OR) { + SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV); + SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV); + if (VecVT != CmpVT) + return DAG.getNode(ISD::OR, DL, CmpVT, A, B); + if (HasPT) + return DAG.getNode(ISD::OR, DL, VecVT, A, B); + return DAG.getNode(ISD::AND, DL, CmpVT, A, B); + } else if (X.getOpcode() == ISD::XOR) { + SDValue A = SToV(Op0); + SDValue B = SToV(Op1); + if (VecVT != CmpVT) + return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE); + if (HasPT) + return DAG.getNode(ISD::XOR, DL, VecVT, A, B); + return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + } + llvm_unreachable("Impossible"); +} + /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, @@ -42589,10 +43726,8 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, // logically-combined vector-sized operands compared to zero. This pattern may // be generated by the memcmp expansion pass with oversized integer compares // (see PR33325). - bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR && - X.getOperand(0).getOpcode() == ISD::XOR && - X.getOperand(1).getOpcode() == ISD::XOR; - if (isNullConstant(Y) && !IsOrXorXorCCZero) + bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X); + if (isNullConstant(Y) && !IsOrXorXorTreeCCZero) return SDValue(); // Don't perform this combine if constructing the vector will be expensive. @@ -42602,66 +43737,102 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, X.getOpcode() == ISD::LOAD; }; if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && - !IsOrXorXorCCZero) + !IsOrXorXorTreeCCZero) return SDValue(); EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); bool HasAVX = Subtarget.hasAVX(); - // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512. + // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. + // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && HasAVX) || (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); + + // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened + // vector registers are essentially free. (Technically, widening registers + // prevents load folding, but the tradeoff is worth it.) + bool PreferKOT = Subtarget.preferMaskRegisters(); + bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512; + EVT VecVT = MVT::v16i8; - EVT CmpVT = MVT::v16i8; - if (OpSize == 256) - VecVT = CmpVT = MVT::v32i8; - if (OpSize == 512) { + EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT; + if (OpSize == 256) { + VecVT = MVT::v32i8; + CmpVT = PreferKOT ? MVT::v32i1 : VecVT; + } + EVT CastVT = VecVT; + bool NeedsAVX512FCast = false; + if (OpSize == 512 || NeedZExt) { if (Subtarget.hasBWI()) { VecVT = MVT::v64i8; CmpVT = MVT::v64i1; + if (OpSize == 512) + CastVT = VecVT; } else { VecVT = MVT::v16i32; CmpVT = MVT::v16i1; + CastVT = OpSize == 512 ? VecVT : + OpSize == 256 ? 
MVT::v8i32 : MVT::v4i32; + NeedsAVX512FCast = true; + } + } + + auto ScalarToVector = [&](SDValue X) -> SDValue { + bool TmpZext = false; + EVT TmpCastVT = CastVT; + if (X.getOpcode() == ISD::ZERO_EXTEND) { + SDValue OrigX = X.getOperand(0); + unsigned OrigSize = OrigX.getScalarValueSizeInBits(); + if (OrigSize < OpSize) { + if (OrigSize == 128) { + TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8; + X = OrigX; + TmpZext = true; + } else if (OrigSize == 256) { + TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8; + X = OrigX; + TmpZext = true; + } + } } - } + X = DAG.getBitcast(TmpCastVT, X); + if (!NeedZExt && !TmpZext) + return X; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, + DAG.getConstant(0, DL, VecVT), X, + DAG.getConstant(0, DL, VecIdxVT)); + }; SDValue Cmp; - if (IsOrXorXorCCZero) { + if (IsOrXorXorTreeCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne // Use 2 vector equality compares and 'and' the results before doing a // MOVMSK. - SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0)); - SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); - SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); - SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - if (VecVT == CmpVT && HasPT) { - SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B); - SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D); - Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2); - } else { - SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); - } + Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector); } else { - SDValue VecX = DAG.getBitcast(VecVT, X); - SDValue VecY = DAG.getBitcast(VecVT, Y); - if (VecVT == CmpVT && HasPT) { + SDValue VecX = ScalarToVector(X); + SDValue VecY = ScalarToVector(Y); + if (VecVT != CmpVT) { + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE); + } else if (HasPT) { Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); } else { Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } } - // For 512-bits we want to emit a setcc that will lower to kortest. + // AVX512 should emit a setcc that will lower to kortest. if (VecVT != CmpVT) { - EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16; - SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT); - return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC); + EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : + CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16; + return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), + DAG.getConstant(0, DL, KRegVT), CC); } if (HasPT) { SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? 
MVT::v4i64 : MVT::v2i64, @@ -42687,9 +43858,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); + const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + const SDValue LHS = N->getOperand(0); + const SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); EVT OpVT = LHS.getValueType(); SDLoc DL(N); @@ -42716,30 +43887,35 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { - // Put build_vectors on the right. - if (LHS.getOpcode() == ISD::BUILD_VECTOR) { - std::swap(LHS, RHS); - CC = ISD::getSetCCSwappedOperands(CC); + // Using temporaries to avoid messing up operand ordering for later + // transformations if this doesn't work. + SDValue Op0 = LHS; + SDValue Op1 = RHS; + ISD::CondCode TmpCC = CC; + // Put build_vector on the right. + if (Op0.getOpcode() == ISD::BUILD_VECTOR) { + std::swap(Op0, Op1); + TmpCC = ISD::getSetCCSwappedOperands(TmpCC); } bool IsSEXT0 = - (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1); - bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + (Op0.getOpcode() == ISD::SIGN_EXTEND) && + (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1); + bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode()); if (IsSEXT0 && IsVZero1) { - assert(VT == LHS.getOperand(0).getValueType() && + assert(VT == Op0.getOperand(0).getValueType() && "Uexpected operand type"); - if (CC == ISD::SETGT) + if (TmpCC == ISD::SETGT) return DAG.getConstant(0, DL, VT); - if (CC == ISD::SETLE) + if (TmpCC == ISD::SETLE) return DAG.getConstant(1, DL, VT); - if (CC == ISD::SETEQ || CC == ISD::SETGE) - return DAG.getNOT(DL, LHS.getOperand(0), VT); + if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE) + return DAG.getNOT(DL, Op0.getOperand(0), VT); - assert((CC == ISD::SETNE || CC == ISD::SETLT) && + assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) && "Unexpected condition code!"); - return LHS.getOperand(0); + return Op0.getOperand(0); } } @@ -42752,8 +43928,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, VT.getVectorElementType() == MVT::i1 && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { - SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, - N->getOperand(2)); + SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC); return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc); } @@ -42985,16 +44160,18 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); - if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || - N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || - VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) + bool IsStrict = N->isStrictFPOpcode(); + SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); + if (!VT.isVector() || Op0->getOpcode() != ISD::AND || + Op0->getOperand(0)->getOpcode() != ISD::SETCC || + VT.getSizeInBits() != Op0.getValueSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. 
We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. - if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) { + if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); @@ -43004,12 +44181,19 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. - SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); + SDValue SourceConst; + if (IsStrict) + SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other}, + {N->getOperand(0), SDValue(BV, 0)}); + else + SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); - SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, - N->getOperand(0)->getOperand(0), MaskConst); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), + MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); + if (IsStrict) + return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL); return Res; } @@ -43053,7 +44237,8 @@ static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) { static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - SDValue Op0 = N->getOperand(0); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); @@ -43067,14 +44252,21 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {N->getOperand(0), P}); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. - if (DAG.SignBitIsZero(Op0)) + if (DAG.SignBitIsZero(Op0)) { + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other}, + {N->getOperand(0), Op0}); return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); + } return SDValue(); } @@ -43084,11 +44276,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. + bool IsStrict = N->isStrictFPOpcode(); if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) return Res; // Now move on to more general possibilities. - SDValue Op0 = N->getOperand(0); + SDValue Op0 = N->getOperand(IsStrict ? 
1 : 0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); @@ -43100,6 +44293,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {N->getOperand(0), P}); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } @@ -43117,6 +44313,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, + {N->getOperand(0), Trunc}); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); } // If we're after legalize and the type is v2i32 we need to shuffle and @@ -43125,6 +44324,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, { 0, 2, -1, -1 }); + if (IsStrict) + return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, + {N->getOperand(0), Shuf}); return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf); } } @@ -43148,13 +44350,16 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Ld->isSimple() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { - SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( + std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); - DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); - return FILDChain; + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); + return Tmp.first; } } + if (IsStrict) + return SDValue(); + if (SDValue V = combineToFPTruncExtElt(N, DAG)) return V; @@ -43579,7 +44784,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, auto UsePMADDWD = [&](SDValue Op) { ShrinkMode Mode; return Op.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 && + canReduceVMulWidth(Op.getNode(), DAG, Mode) && + Mode != ShrinkMode::MULU16 && (!Subtarget.hasSSE41() || (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && Op->isOnlyUserOf(Op.getOperand(1).getNode()))); @@ -43784,7 +44990,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, // Check if the Mul source can be safely shrunk. 
ShrinkMode Mode; - if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16) + if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || + Mode == ShrinkMode::MULU16) return SDValue(); auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, @@ -44468,7 +45675,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); - EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && @@ -44512,31 +45718,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, VT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); - // Try to move vector bitcast after extract_subv by scaling extraction index: - // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') - // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec != InVecBC && InVecBCVT.isVector()) { - unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); - unsigned DestNumElts = InVecVT.getVectorNumElements(); - if ((DestNumElts % SrcNumElts) == 0) { - unsigned DestSrcRatio = DestNumElts / SrcNumElts; - if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { - unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; - EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - InVecBCVT.getScalarType(), NewExtNumElts); - if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && - TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { - unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; - SDLoc DL(N); - SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); - SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - InVecBC, NewIndex); - return DAG.getBitcast(VT, NewExtract); - } - } - } - } - // If we are extracting from an insert into a zero vector, replace with a // smaller insert into zero if we don't access less than the original // subvector. Don't do this for i1 vectors. @@ -44583,7 +45764,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); } // v2f64 CVTUDQ2PD(v4i32). 
- if (InOpcode == ISD::UINT_TO_FP && + if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); } @@ -44751,6 +45932,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return DAG.getConstant(0, SDLoc(N), VT); + APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); @@ -44802,8 +45986,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); - case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); + case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + return combineSIntToFP(N, DAG, DCI, Subtarget); + case ISD::UINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6f7e90008de4..3a17099da38f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -598,6 +598,34 @@ namespace llvm { // For avx512-vp2intersect VP2INTERSECT, + /// X86 strict FP compare instructions. + STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCMPS, + + // Vector packed double/float comparison. + STRICT_CMPP, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + STRICT_CMPM, + + // Vector float/double to signed/unsigned integer with truncation. + STRICT_CVTTP2SI, STRICT_CVTTP2UI, + + // Vector FP extend. + STRICT_VFPEXT, + + // Vector FP round. + STRICT_VFPROUND, + + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + STRICT_VRNDSCALE, + + // Vector signed/unsigned integer to float/double. + STRICT_CVTSI2P, STRICT_CVTUI2P, + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, @@ -969,9 +997,7 @@ namespace llvm { unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { - if (ConstraintCode == "i") - return InlineAsm::Constraint_i; - else if (ConstraintCode == "o") + if (ConstraintCode == "o") return InlineAsm::Constraint_o; else if (ConstraintCode == "v") return InlineAsm::Constraint_v; @@ -1056,7 +1082,8 @@ namespace llvm { /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this /// method returns true, otherwise fmuladd is expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; /// Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. 
on x86, it's profitable to narrow @@ -1125,9 +1152,6 @@ namespace llvm { bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override; - bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, - bool IsSigned) const override; - /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, @@ -1165,7 +1189,7 @@ namespace llvm { return nullptr; // nothing to do, move along. } - Register getRegisterByName(const char* RegName, EVT VT, + Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the @@ -1203,8 +1227,9 @@ namespace llvm { /// offset as appropriate. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; - SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, - SelectionDAG &DAG) const; + std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, + SDValue StackSlot, + SelectionDAG &DAG) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; @@ -1315,7 +1340,8 @@ namespace llvm { unsigned getAddressSpace(void) const; - SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const; + SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned, + SDValue &Chain) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -1340,6 +1366,7 @@ namespace llvm { SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -1358,8 +1385,7 @@ namespace llvm { SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -1477,20 +1503,15 @@ namespace llvm { MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; - /// Emit nodes that will be selected as "cmp Op0,Op1", or something - /// equivalent, for use with the given x86 condition code. - SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, - SelectionDAG &DAG) const; - /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. 
- SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, - ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG, - SDValue &X86CC) const; + SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + SDValue &X86CC, SDValue &Chain, + bool IsSignaling) const; /// Check if replacement of SQRT with RSQRT should be disabled. bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index cc0f59ab329d..48d0d8a35704 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -48,12 +48,12 @@ private: static char ID; /// Machine instruction info used throughout the class. - const X86InstrInfo *TII; + const X86InstrInfo *TII = nullptr; /// Endbr opcode for the current machine function. - unsigned int EndbrOpcode; + unsigned int EndbrOpcode = 0; - /// Adds a new ENDBR instruction to the begining of the MBB. + /// Adds a new ENDBR instruction to the beginning of the MBB. /// The function will not add it if already exists. /// It will add ENDBR32 or ENDBR64 opcode, depending on the target. /// \returns true if the ENDBR was added and false otherwise. diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9b5de59430a5..32f012033fb0 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2078,7 +2078,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; + timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2089,8 +2089,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, timm:$cc), (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), @@ -2111,7 +2112,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, timm:$cc))]>, - EVEX_4V, VEX_LIG, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -2121,7 +2122,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, (_.ScalarLdFrag addr:$src2), timm:$cc))]>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -2522,11 +2523,12 @@ def X86cmpm_imm_commute : SDNodeXForm<timm, [{ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86any_cmpm (_.VT 
_.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 1>, Sched<[sched]>; @@ -2534,8 +2536,8 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), - timm:$cc), + (X86any_cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), timm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -2546,17 +2548,18 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, "vcmp"#_.Suffix, "$cc, ${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr#", $cc", - (X86cmpm (_.VT _.RC:$src1), - (_.VT (_.BroadcastLdFrag addr:$src2)), - timm:$cc), + (X86any_cmpm (_.VT _.RC:$src1), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } // Patterns for selecting with loads in other operand. - def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - timm:$cc), + def : Pat<(X86any_cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), + timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; @@ -2567,8 +2570,8 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), - (_.VT _.RC:$src1), timm:$cc), + def : Pat<(X86any_cmpm (_.BroadcastLdFrag addr:$src2), + (_.VT _.RC:$src1), timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; @@ -2582,6 +2585,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { // comparison code form (VCMP[EQ/LT/LE/...] 
+ let Uses = [MXCSR] in defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, @@ -2639,7 +2643,7 @@ def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2), multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, Predicate prd> { - let Predicates = [prd], ExeDomain = _.ExeDomain in { + let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2679,7 +2683,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string mem>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -3197,8 +3201,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { -def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), timm:$cc)), +def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3215,8 +3219,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, timm:$cc), Narrow.KRC)>; // Broadcast load. -def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), - (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), +def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbi") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3231,8 +3235,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, addr:$src2, timm:$cc), Narrow.KRC)>; // Commuted with broadcast load. -def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), - (Narrow.VT Narrow.RC:$src1), timm:$cc)), +def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbi") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3928,6 +3932,17 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", (VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>; +// Conversions between masks and scalar fp. 
+def : Pat<(v32i1 (bitconvert FR32X:$src)), + (KMOVDkr (VMOVSS2DIZrr FR32X:$src))>; +def : Pat<(f32 (bitconvert VK32:$src)), + (VMOVDI2SSZrr (KMOVDrk VK32:$src))>; + +def : Pat<(v64i1 (bitconvert FR64X:$src)), + (KMOVQkr (VMOVSDto64Zrr FR64X:$src))>; +def : Pat<(f64 (bitconvert VK64:$src)), + (VMOV64toSDZrr (KMOVQrk VK64:$src))>; + //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// @@ -5278,7 +5293,7 @@ defm : avx512_logical_lowering_types<"VPANDN", X86andnp>; multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, X86FoldableSchedWrite sched, bit IsCommutable> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -5312,7 +5327,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, X86FoldableSchedWrite sched, bit IsCommutable = 0> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -5329,16 +5344,17 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.RC:$src2))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.ScalarIntMemCPat:$src2))>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; - let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + let isCodeGenOnly = 1, Predicates = [HasAVX512], + Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5356,6 +5372,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, EVEX2VEXOverride<EVEX2VexOvrd#"rm">; } + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -5391,13 +5408,13 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } -defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds, +defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, SchedWriteFAddSizes, 1>; -defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds, +defm VMUL : avx512_binop_s_round<0x59, "vmul", any_fmul, X86fmuls, X86fmulRnds, SchedWriteFMulSizes, 1>; -defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds, +defm VSUB : avx512_binop_s_round<0x5C, "vsub", any_fsub, X86fsubs, X86fsubRnds, SchedWriteFAddSizes, 0>; -defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds, +defm VDIV : avx512_binop_s_round<0x5E, 
"vdiv", any_fdiv, X86fdivs, X86fdivRnds, SchedWriteFDivSizes, 0>; defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs, SchedWriteFCmpSizes, 0>; @@ -5429,27 +5446,28 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSD">, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSD">, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + EVEX_CD8<64, CD8VT1>, SIMD_EXC; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable, bit IsKCommutable = IsCommutable> { - let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -5476,7 +5494,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -5487,7 +5505,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -5526,6 +5544,7 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op } } +let Uses = [MXCSR] in multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, @@ -5536,6 +5555,7 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } +let Uses = [MXCSR] in multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, @@ -5546,16 +5566,16 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } -defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, +defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, HasAVX512, SchedWriteFAddSizes, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, 
SchedWriteFAddSizes>; -defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, +defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, HasAVX512, SchedWriteFMulSizes, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, HasAVX512, SchedWriteFAddSizes>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>; -defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, HasAVX512, SchedWriteFDivSizes>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>; defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, @@ -5570,6 +5590,7 @@ let isCodeGenOnly = 1 in { defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, SchedWriteFCmpSizes, 1>; } +let Uses = []<Register>, mayRaiseFPException = 0 in { defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI, @@ -5578,10 +5599,11 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; +} multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -5603,7 +5625,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -6399,7 +6421,8 @@ let Predicates = [HasAVX512] in { multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", @@ -6425,7 +6448,8 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", @@ -6462,7 +6486,7 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_W; } -defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMADD213 : avx512_fma3p_213_f<0xA8, 
"vfmadd213", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; @@ -6473,7 +6497,8 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", @@ -6500,7 +6525,8 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", @@ -6538,7 +6564,7 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_W; } -defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; @@ -6548,7 +6574,8 @@ defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubR multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", @@ -6578,7 +6605,8 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", @@ -6616,7 +6644,7 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_W; } -defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; defm 
VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; @@ -6630,14 +6658,15 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>; + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; let mayLoad = 1 in defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>; + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>, @@ -6648,13 +6677,14 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>; + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>; + [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc), !strconcat(OpcodeStr, @@ -6711,7 +6741,7 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>; defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; @@ -6918,7 +6948,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, } } -defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SS", +defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS", X86Movss, v4f32x_info, fp32imm0>; defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS", X86Movss, v4f32x_info, fp32imm0>; @@ -6927,7 +6957,7 @@ defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS", defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS", X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SD", +defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD", X86Movsd, v2f64x_info, fp64imm0>; defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD", X86Movsd, v2f64x_info, fp64imm0>; @@ -6997,7 +7027,10 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h, multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, 
X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, X86MemOperand x86memop, PatFrag ld_frag, string asm, - string mem> { + string mem, list<Register> _Uses = [MXCSR], + bit _mayRaiseFPException = 1> { +let ExeDomain = DstVT.ExeDomain, Uses = _Uses, + mayRaiseFPException = _mayRaiseFPException in { let hasSideEffects = 0, isCodeGenOnly = 1 in { def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, SrcRC:$src), @@ -7023,6 +7056,7 @@ multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSched (OpNode (DstVT.VT DstVT.RC:$src1), (ld_frag addr:$src2)))]>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; +} def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst, DstVT.RC:$src1, SrcRC:$src2), 0, "att">; @@ -7032,6 +7066,7 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm, string mem> { + let ExeDomain = DstVT.ExeDomain, Uses = [MXCSR] in def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), !strconcat(asm, @@ -7066,7 +7101,7 @@ defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32, - v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">, + v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SD, GR64, @@ -7078,22 +7113,22 @@ def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; -def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), +def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), +def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), +def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), +def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (sint_to_fp GR32:$src)), +def : Pat<(f32 (any_sint_to_fp GR32:$src)), (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f32 (sint_to_fp GR64:$src)), +def : Pat<(f32 (any_sint_to_fp GR64:$src)), (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; -def : Pat<(f64 (sint_to_fp GR32:$src)), +def : Pat<(f64 (any_sint_to_fp GR32:$src)), (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f64 (sint_to_fp GR64:$src)), +def : Pat<(f64 (any_sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, @@ -7105,7 +7140,7 @@ defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info, - i32mem, loadi32, "cvtusi2sd", "l">, + i32mem, loadi32, "cvtusi2sd", "l", [], 0>, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTUSI642SDZ : 
avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SD, GR64, @@ -7117,22 +7152,22 @@ def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; -def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), +def : Pat<(f32 (any_uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), +def : Pat<(f32 (any_uint_to_fp (loadi64 addr:$src))), (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), +def : Pat<(f64 (any_uint_to_fp (loadi32 addr:$src))), (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), +def : Pat<(f64 (any_uint_to_fp (loadi64 addr:$src))), (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; -def : Pat<(f32 (uint_to_fp GR32:$src)), +def : Pat<(f32 (any_uint_to_fp GR32:$src)), (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f32 (uint_to_fp GR64:$src)), +def : Pat<(f32 (any_uint_to_fp GR64:$src)), (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; -def : Pat<(f64 (uint_to_fp GR32:$src)), +def : Pat<(f64 (any_uint_to_fp GR32:$src)), (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; -def : Pat<(f64 (uint_to_fp GR64:$src)), +def : Pat<(f64 (any_uint_to_fp GR64:$src)), (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } @@ -7145,11 +7180,12 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, SDNode OpNodeRnd, X86FoldableSchedWrite sched, string asm, string aliasStr> { - let Predicates = [HasAVX512] in { + let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in { def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src)))]>, - EVEX, VEX_LIG, Sched<[sched]>; + EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc), !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>, @@ -7159,7 +7195,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>, - EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } // Predicates = [HasAVX512] def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", @@ -7202,82 +7238,82 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u let Predicates = [HasAVX512] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 
VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR64:$src)))))), (VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi64 addr:$src))))))), (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR32:$src)))))), (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi32 addr:$src))))))), (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR64:$src)))))), (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi64 addr:$src))))))), (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR32:$src)))))), (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi32 addr:$src))))))), (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>; } // Predicates = [HasAVX512] @@ -7286,22 +7322,23 @@ multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, SDNode OpNodeInt, SDNode OpNodeSAE, X86FoldableSchedWrite sched, string aliasStr>{ -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in { let isCodeGenOnly = 1 in { def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, - EVEX, VEX_LIG, Sched<[sched]>; + EVEX, 
VEX_LIG, Sched<[sched]>, SIMD_EXC; def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, - EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.RC:$src)))]>, - EVEX, VEX_LIG, Sched<[sched]>; + EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), [(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>, @@ -7311,7 +7348,7 @@ let Predicates = [HasAVX512] in { !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>, - EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } //HasAVX512 def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", @@ -7324,35 +7361,36 @@ let Predicates = [HasAVX512] in { } defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// +let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvt_fp_scalar<bits<8> opc, string 
OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode, X86FoldableSchedWrite sched> { @@ -7387,6 +7425,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -7399,6 +7438,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeRnd, X86FoldableSchedWrite sched> { + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -7435,28 +7475,28 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f32x_info, f64x_info>; -def : Pat<(f64 (fpextend FR32X:$src)), +def : Pat<(f64 (any_fpextend FR32X:$src)), (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, Requires<[HasAVX512]>; -def : Pat<(f64 (fpextend (loadf32 addr:$src))), +def : Pat<(f64 (any_fpextend (loadf32 addr:$src))), (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(f32 (fpround FR64X:$src)), +def : Pat<(f32 (any_fpround FR64X:$src)), (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, Requires<[HasAVX512]>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector - (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (f32 (any_fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>, Requires<[HasAVX512]>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector - (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (f64 (any_fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>, Requires<[HasAVX512]>; @@ -7472,7 +7512,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, string Alias = "", X86MemOperand MemOp = _Src.MemOp, RegisterClass MaskRC = _.KRCWM, dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> { - +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), (ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src), @@ -7512,11 +7552,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; + } } // Coversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { + let Uses = [MXCSR] in defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", @@ -7528,6 +7570,7 @@ multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeRnd, X86FoldableSchedWrite sched> { + let Uses = [MXCSR] in defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, 
"$rc, $src", "$src, $rc", @@ -7551,14 +7594,14 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info, - fpextend, sched.ZMM>, + any_fpextend, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, X86vfpextSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info, - X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend, + X86any_vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, any_fpextend, sched.YMM>, EVEX_V256; } } @@ -7566,7 +7609,7 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, // Truncate Double to Float multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfpround, sched.ZMM>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86any_vfpround, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfproundRnd, sched.ZMM>, EVEX_V512; } @@ -7574,7 +7617,7 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86vfpround, + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86any_vfpround, sched.YMM, "{1to4}", "{y}">, EVEX_V256; } @@ -7624,70 +7667,10 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>, PS, EVEX_CD8<32, CD8VH>; -let Predicates = [HasAVX512] in { - def : Pat<(v8f32 (fpround (v8f64 VR512:$src))), - (VCVTPD2PSZrr VR512:$src)>; - def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), - VR256X:$src0), - (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>; - def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))), - v8f32x_info.ImmAllZerosV), - (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>; - - def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), - (VCVTPD2PSZrm addr:$src)>; - def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), - VR256X:$src0), - (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>; - def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))), - v8f32x_info.ImmAllZerosV), - (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), - (VCVTPD2PSZrmb addr:$src)>; - def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcastld64 addr:$src))), - (v8f32 VR256X:$src0)), - (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; - def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcastld64 addr:$src))), - v8f32x_info.ImmAllZerosV), - (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; -} - let Predicates = [HasVLX] in { - def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))), - (VCVTPD2PSZ256rr VR256X:$src)>; - def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), - VR128X:$src0), - (VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>; - def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))), - 
v4f32x_info.ImmAllZerosV), - (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>; - - def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), - (VCVTPD2PSZ256rm addr:$src)>; - def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), - VR128X:$src0), - (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))), - v4f32x_info.ImmAllZerosV), - (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), - (VCVTPD2PSZ256rmb addr:$src)>; - def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), - VR128X:$src0), - (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), - v4f32x_info.ImmAllZerosV), - (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; - // Special patterns to allow use of X86vmfpround for masking. Instruction // patterns have been disabled with null_frag. - def : Pat<(X86vfpround (v2f64 VR128X:$src)), + def : Pat<(X86any_vfpround (v2f64 VR128X:$src)), (VCVTPD2PSZ128rr VR128X:$src)>; def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -7696,7 +7679,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(X86vfpround (loadv2f64 addr:$src)), + def : Pat<(X86any_vfpround (loadv2f64 addr:$src)), (VCVTPD2PSZ128rm addr:$src)>; def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -7705,7 +7688,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), + def : Pat<(X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -7716,6 +7699,7 @@ let Predicates = [HasVLX] in { } // Convert Signed/Unsigned Doubleword to Double +let Uses = []<Register>, mayRaiseFPException = 0 in multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNode128, X86SchedWriteWidths sched> { // No rounding in this op @@ -8075,34 +8059,34 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, VK4WM:$mask, i64mem:$src), 0, "att">; } -defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86any_VSintToFP, SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si, +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si, X86cvttp2siSAE, SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si, +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si, X86cvttp2siSAE, SchedWriteCvtPD2DQ>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui, +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui, +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", 
X86any_cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, - X86VUintToFP, SchedWriteCvtDQ2PD>, XS, +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp, + X86any_VUintToFP, SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>; @@ -8138,35 +8122,35 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; -defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si, +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si, X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si, +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si, X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; -defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui, +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui, +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; -defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; -defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, EVEX_CD8<64, CD8VF>; @@ -8202,7 +8186,7 @@ let Predicates = [HasVLX] in { // Special patterns to allow use of X86mcvttp2si for masking. Instruction // patterns have been disabled with null_frag. 
- def : Pat<(v4i32 (X86cvttp2si (v2f64 VR128X:$src))), + def : Pat<(v4i32 (X86any_cvttp2si (v2f64 VR128X:$src))), (VCVTTPD2DQZ128rr VR128X:$src)>; def : Pat<(X86mcvttp2si (v2f64 VR128X:$src), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8211,7 +8195,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(v4i32 (X86cvttp2si (loadv2f64 addr:$src))), + def : Pat<(v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))), (VCVTTPD2DQZ128rm addr:$src)>; def : Pat<(X86mcvttp2si (loadv2f64 addr:$src), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8220,7 +8204,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), + def : Pat<(v4i32 (X86any_cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8260,7 +8244,7 @@ let Predicates = [HasVLX] in { // Special patterns to allow use of X86mcvtp2UInt for masking. Instruction // patterns have been disabled with null_frag. - def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))), + def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 VR128X:$src))), (VCVTTPD2UDQZ128rr VR128X:$src)>; def : Pat<(X86mcvttp2ui (v2f64 VR128X:$src), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8269,7 +8253,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (loadv2f64 addr:$src))), + def : Pat<(v4i32 (X86any_cvttp2ui (loadv2f64 addr:$src))), (VCVTTPD2UDQZ128rm addr:$src)>; def : Pat<(X86mcvttp2ui (loadv2f64 addr:$src), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8278,7 +8262,7 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), + def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), @@ -8311,7 +8295,7 @@ let Predicates = [HasDQI, HasVLX] in { v2i64x_info.ImmAllZerosV)), (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTTPS2QQZ128rm addr:$src)>; def : Pat<(v2i64 (vselect VK2WM:$mask, (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), @@ -8322,7 +8306,7 @@ let Predicates = [HasDQI, HasVLX] in { v2i64x_info.ImmAllZerosV)), (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), + def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTTPS2UQQZ128rm addr:$src)>; def : Pat<(v2i64 (vselect VK2WM:$mask, (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), @@ -8334,63 +8318,26 @@ let Predicates = [HasDQI, HasVLX] in { (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; } -let Predicates = [HasAVX512, NoVLX] in { -def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr - (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))), - (EXTRACT_SUBREG 
(v8i32 (VCVTTPD2UDQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_xmm)>; - -def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), - (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), - (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr - (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_ymm)>; - -def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr - (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; -} - let Predicates = [HasVLX] in { - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDZ128rm addr:$src)>; def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), VR128X:$src0)), (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), v2f64x_info.ImmAllZerosV)), (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTUDQ2PDZ128rm addr:$src)>; def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), VR128X:$src0)), (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), v2f64x_info.ImmAllZerosV)), (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -8398,7 +8345,7 @@ let Predicates = [HasVLX] in { let Predicates = [HasDQI, HasVLX] in { // Special patterns to allow use of X86VMSintToFP for masking. Instruction // patterns have been disabled with null_frag. - def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))), + def : Pat<(v4f32 (X86any_VSintToFP (v2i64 VR128X:$src))), (VCVTQQ2PSZ128rr VR128X:$src)>; def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8407,7 +8354,7 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))), + def : Pat<(v4f32 (X86any_VSintToFP (loadv2i64 addr:$src))), (VCVTQQ2PSZ128rm addr:$src)>; def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8416,7 +8363,7 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), + def : Pat<(v4f32 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8427,7 +8374,7 @@ let Predicates = [HasDQI, HasVLX] in { // Special patterns to allow use of X86VMUintToFP for masking. 
Instruction // patterns have been disabled with null_frag. - def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))), + def : Pat<(v4f32 (X86any_VUintToFP (v2i64 VR128X:$src))), (VCVTUQQ2PSZ128rr VR128X:$src)>; def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8436,7 +8383,7 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>; - def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))), + def : Pat<(v4f32 (X86any_VUintToFP (loadv2i64 addr:$src))), (VCVTUQQ2PSZ128rm addr:$src)>; def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8445,7 +8392,7 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), + def : Pat<(v4f32 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), @@ -8455,72 +8402,11 @@ let Predicates = [HasDQI, HasVLX] in { (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } -let Predicates = [HasDQI, NoVLX] in { -def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr - (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_ymm)>; - -def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr - (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_ymm)>; - -def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))), - (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))), - (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_xmm)>; - -def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; - -def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))), - (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_xmm)>; - -def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src1, sub_xmm)))), sub_xmm)>; - -def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), - (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr - (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), - VR256X:$src1, sub_ymm)))), sub_ymm)>; -} - //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// +let Uses = [MXCSR], 
mayRaiseFPException = 1 in multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, X86MemOperand x86memop, PatFrag ld_frag, X86FoldableSchedWrite sched> { @@ -8537,6 +8423,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, X86FoldableSchedWrite sched> { + let Uses = [MXCSR] in defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "{sae}, $src", "$src, {sae}", @@ -8568,7 +8455,7 @@ let Predicates = [HasVLX] in { multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> { -let ExeDomain = GenericDomain in { +let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -8605,7 +8492,7 @@ let ExeDomain = GenericDomain in { multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, SchedWrite Sched> { - let hasSideEffects = 0 in + let hasSideEffects = 0, Uses = [MXCSR] in defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), @@ -8664,52 +8551,51 @@ let Predicates = [HasVLX] in { // Unordered/Ordered scalar fp compare with Sae and set EFLAGS multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, - string OpcodeStr, X86FoldableSchedWrite sched> { - let hasSideEffects = 0 in + string OpcodeStr, Domain d, + X86FoldableSchedWrite sched = WriteFCom> { + let hasSideEffects = 0, Uses = [MXCSR] in def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { - defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>, + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSEPackedSingle>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>, + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSEPackedDouble>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; - defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>, + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSEPackedSingle>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>, + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSEPackedDouble>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { - defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD, EVEX, + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32, + "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : 
sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64, + "comisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; - let Pattern = []<dag> in { - defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, - "comiss", WriteFCom>, PS, EVEX, VEX_LIG, - EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD, EVEX, - VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; - } let isCodeGenOnly = 1 in { defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX, + sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG, + sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD, EVEX, + sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } @@ -8717,7 +8603,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { + let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -8767,6 +8653,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, } } +let Uses = [MXCSR] in multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM, @@ -8798,12 +8685,12 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>; multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -8815,7 +8702,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -8840,7 +8727,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm r : 
AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", (OpNode (_.VT _.RC:$src))>, @@ -8862,7 +8749,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, } multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", @@ -8923,25 +8810,26 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (fsqrt _.RC:$src))>, EVEX, + (_.VT (any_fsqrt _.RC:$src))>, EVEX, Sched<[sched]>; defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", - (fsqrt (_.VT + (any_fsqrt (_.VT (bitconvert (_.LdFrag addr:$src))))>, EVEX, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, - (fsqrt (_.VT + (any_fsqrt (_.VT (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } +let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), @@ -8967,6 +8855,7 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, } } +let Uses = [MXCSR] in multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), @@ -8985,13 +8874,14 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri "$src2, $src1", "$src1, $src2", (X86fsqrts (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (X86fsqrts (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -9004,23 +8894,23 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } let Predicates = [HasAVX512] in { - def : Pat<(_.EltVT (fsqrt _.FRC:$src)), + def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)), (!cast<Instruction>(Name#Zr) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; 
} let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(_.EltVT (fsqrt (load addr:$src))), + def : Pat<(_.EltVT (any_fsqrt (load addr:$src))), (!cast<Instruction>(Name#Zm) (_.EltVT (IMPLICIT_DEF)), addr:$src)>; } @@ -9047,8 +8937,9 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 timm:$src3)))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", @@ -9062,30 +8953,30 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[sched]>; + []>, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } let Predicates = [HasAVX512] in { - def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2), + def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2), (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src1, timm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), + def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), addr:$src1, timm:$src2))>; } @@ -9681,7 +9572,7 @@ defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>; defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>; // Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge -// ext+trunc aggresively making it impossible to legalize the DAG to this +// ext+trunc aggressively making it impossible to legalize the DAG to this // pattern directly. 
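The any_fsqrt, X86any_VRndScale and similar any_* selectors appearing in the hunks above are pattern fragments that match both the relaxed DAG node and its constrained, exception-preserving counterpart, so a single instruction pattern covers strict and non-strict FP lowering alike. A minimal sketch of such a wrapper (written from memory, not taken from this commit; the real definitions live in TargetSelectionDAG.td and X86InstrFragmentsSIMD.td and may differ in detail):

// Illustrative sketch only: a PatFrags letting one instruction pattern
// match either ISD::FSQRT or its chained strict form ISD::STRICT_FSQRT.
def any_fsqrt : PatFrags<(ops node:$src),
                         [(strict_fsqrt node:$src),
                          (fsqrt node:$src)]>;

The X86-prefixed variants used in this file, such as X86any_Fmadd and X86any_cvttp2si, follow the same two-alternative shape for the target-specific nodes.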
let Predicates = [HasAVX512, NoBWI] in { def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), @@ -10101,7 +9992,7 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256, //all instruction created with FROUND_CURRENT multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -10127,7 +10018,7 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, {sae}, $src1", @@ -10160,7 +10051,7 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, //all instruction created with FROUND_CURRENT multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10232,7 +10123,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, // op(reg_vec2,mem_scalar,imm) multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10254,7 +10145,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", @@ -10268,7 +10159,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", @@ -10350,7 +10241,7 @@ defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56 X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, - X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>, + X86any_VRndScale, X86VRndScaleSAE, 
SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>, @@ -10892,10 +10783,12 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load // AVX-512 - Unpack Instructions //===----------------------------------------------------------------------===// +let Uses = []<Register>, mayRaiseFPException = 0 in { defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512, SchedWriteFShuffleSizes, 0, 1>; defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512, SchedWriteFShuffleSizes>; +} defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, SchedWriteShuffle, HasBWI>; @@ -11587,7 +11480,8 @@ let Predicates = [HasVLX] in { multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT>{ - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -11619,7 +11513,7 @@ multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT> : avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> { -let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -11643,7 +11537,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 timm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -11661,7 +11556,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), (i32 timm:$src4))>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -11978,6 +11873,7 @@ let Constraints = "$src1 = $dst" in multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo VTI, bit IsCommutable> { + let ExeDomain = VTI.ExeDomain in { defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", @@ -12000,6 +11896,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, @@ -12164,7 +12061,7 @@ defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb", //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, - Constraints = "$src1 = $dst" in { + Constraints = "$src1 = 
$dst", Uses = [MXCSR], mayRaiseFPException = 1 in { defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fmaddps", "$src3, $src2", "$src2, $src3", @@ -12210,9 +12107,9 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, } let hasSideEffects = 0 in { - let mayStore = 1 in + let mayStore = 1, SchedRW = [WriteFStoreX] in def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>; - let mayLoad = 1 in + let mayLoad = 1, SchedRW = [WriteFLoadX] in def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>; } @@ -12220,7 +12117,7 @@ let hasSideEffects = 0 in { // VP2INTERSECT //===----------------------------------------------------------------------===// -multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { +multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInfo _> { def rr : I<0x68, MRMSrcReg, (outs _.KRPC:$dst), (ins _.RC:$src1, _.RC:$src2), @@ -12228,7 +12125,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT _.RC:$src2)))]>, - EVEX_4V, T8XD; + EVEX_4V, T8XD, Sched<[sched]>; def rm : I<0x68, MRMSrcMem, (outs _.KRPC:$dst), @@ -12237,7 +12134,8 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, - EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>; + EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : I<0x68, MRMSrcMem, (outs _.KRPC:$dst), @@ -12246,21 +12144,22 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, - EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_vp2intersect<AVX512VLVectorVTInfo _> { +multiclass avx512_vp2intersect<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512, HasVP2INTERSECT] in - defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512; + defm Z : avx512_vp2intersect_modes<sched.ZMM, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in { - defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256; - defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128; + defm Z256 : avx512_vp2intersect_modes<sched.YMM, _.info256>, EVEX_V256; + defm Z128 : avx512_vp2intersect_modes<sched.XMM, _.info128>, EVEX_V128; } } -defm VP2INTERSECTD : avx512_vp2intersect<avx512vl_i32_info>; -defm VP2INTERSECTQ : avx512_vp2intersect<avx512vl_i64_info>, VEX_W; +defm VP2INTERSECTD : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i32_info>; +defm VP2INTERSECTQ : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i64_info>, VEX_W; multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched, @@ -12293,17 +12192,19 @@ defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", // Truncate Float to BFloat16 multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { - let Predicates = [HasBF16] in { + let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info, 
X86cvtneps2bf16, sched.ZMM>, EVEX_V512; } let Predicates = [HasBF16, HasVLX] in { + let Uses = []<Register>, mayRaiseFPException = 0 in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info, null_frag, sched.XMM, "{1to4}", "{x}", f128mem, VK4WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info, X86cvtneps2bf16, sched.YMM, "{1to8}", "{y}">, EVEX_V256; + } def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, @@ -12358,19 +12259,21 @@ let Predicates = [HasBF16, HasVLX] in { let Constraints = "$src1 = $dst" in { multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo src_v> { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, - EVEX_4V; + EVEX_4V, Sched<[sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (src_v.VT (bitconvert - (src_v.LdFrag addr:$src3)))))>, EVEX_4V; + (src_v.LdFrag addr:$src3)))))>, EVEX_4V, + Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -12379,26 +12282,26 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, - EVEX_B, EVEX_4V; + EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } } // Constraints = "$src1 = $dst" multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo src_v, Predicate prd> { let Predicates = [prd] in { - defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info512, + defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, src_v.info512>, EVEX_V512; } let Predicates = [HasVLX, prd] in { - defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info256, + defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256, src_v.info256>, EVEX_V256; - defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info128, + defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128, src_v.info128>, EVEX_V128; } } -defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, +defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, avx512vl_f32_info, avx512vl_i32_info, HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index e1e6eea59884..32faeb1a86f2 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -220,12 +220,12 @@ let isCall = 1 in // registers are added manually. 
let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i32imm_pcrel:$dst),
+ (outs), (ins i32imm_brtarget:$dst),
"call{l}\t$dst", []>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
- (outs), (ins i16imm_pcrel:$dst),
+ (outs), (ins i16imm_brtarget:$dst),
"call{w}\t$dst", []>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
@@ -285,7 +285,7 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [ESP, SSP] in {
- def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset),
+ def TCRETURNdi : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
@@ -293,7 +293,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>;
- def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst),
+ def TAILJMPd : PseudoI<(outs), (ins i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
@@ -309,10 +309,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+ (ins i32imm_brtarget:$dst, i32imm:$offset, i32imm:$cond),
+ []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>;
+ def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$cond), []>;
}
@@ -328,7 +329,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst),
+ (outs), (ins i64i32imm_brtarget:$dst),
"call{q}\t$dst", []>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
@@ -357,7 +358,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [RSP, SSP] in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
def TCRETURNri64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
@@ -367,7 +368,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
(ins i64mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
- def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst),
+ def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
@@ -415,10 +416,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset,
i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd64_CC : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>;
+ (ins i64i32imm_brtarget:$dst, i32imm:$cond), []>;
}
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 0cca71bdc431..9e43a532a3f8 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -95,7 +95,8 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
+ Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
@@ -122,7 +123,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
@@ -137,7 +138,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmadd, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsub, v2f64,
@@ -237,7 +238,7 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
}
let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
- hasSideEffects = 0 in
+ hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
@@ -263,7 +264,8 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// the lowest element of the FMA*_Int instruction. Even though such analysis
// may be not implemented yet we allow the routines doing the actual commute
// transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memopr, RegisterClass RC, X86FoldableSchedWrite sched> { @@ -317,7 +319,7 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, VR128, sdmem, sched>, VEX_W; } -defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd, +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd, SchedWriteFMA.Scl>, VEX_LIG; defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub, SchedWriteFMA.Scl>, VEX_LIG; @@ -370,12 +372,12 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, } } -defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; -defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; @@ -384,6 +386,7 @@ defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR6 // FMA4 - AMD 4 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// +let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, PatFrag mem_frag, X86FoldableSchedWrite sched> { @@ -425,7 +428,8 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, ValueType VT, X86FoldableSchedWrite sched> { -let isCodeGenOnly = 1, hasSideEffects = 0 in { +let isCodeGenOnly = 1, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -458,6 +462,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } +let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag ld_frag256, @@ -533,7 +538,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { let ExeDomain = SSEPackedSingle in { // Scalar Instructions - defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32, + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86any_Fmadd, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; @@ -550,7 +555,7 @@ let ExeDomain = SSEPackedSingle in { fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", 
X86any_Fmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; @@ -566,7 +571,7 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Scalar Instructions - defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64, + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86any_Fmadd, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; @@ -583,7 +588,7 @@ let ExeDomain = SSEPackedDouble in { fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; // Packed Instructions - defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; @@ -624,12 +629,12 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name, } } -defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>; -defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>; defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>; diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td index 2ec6d50f9702..1830262205c6 100644 --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -29,7 +29,7 @@ def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, - [SDNPHasChain, SDNPInGlue, SDNPMayStore, + [SDNPHasChain, SDNPOptInGlue, SDNPMayStore, SDNPMemOperand]>; def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -37,7 +37,7 @@ def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist, - [SDNPHasChain, SDNPInGlue, SDNPMayStore, + [SDNPHasChain, SDNPOptInGlue, SDNPMayStore, SDNPMemOperand]>; def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst, @@ -282,32 +282,32 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), !strconcat("fi", asmstring, "{l}\t$src")>; } -let Defs = [FPSW], Uses = [FPCW] in { +let Uses = [FPCW], mayRaiseFPException = 1 in { // FPBinary_rr just defines pseudo-instructions, no need to set a scheduling // resources. 
let hasNoSchedulingInfo = 1 in {
-defm ADD : FPBinary_rr<fadd>;
-defm SUB : FPBinary_rr<fsub>;
-defm MUL : FPBinary_rr<fmul>;
-defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary_rr<any_fadd>;
+defm SUB : FPBinary_rr<any_fsub>;
+defm MUL : FPBinary_rr<any_fmul>;
+defm DIV : FPBinary_rr<any_fdiv>;
}
// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
let SchedRW = [WriteFAddLd] in {
-defm ADD : FPBinary<fadd, MRM0m, "add">;
-defm SUB : FPBinary<fsub, MRM4m, "sub">;
-defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
+defm ADD : FPBinary<any_fadd, MRM0m, "add">;
+defm SUB : FPBinary<any_fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<any_fsub ,MRM5m, "subr", 0>;
}
let SchedRW = [WriteFMulLd] in {
-defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm MUL : FPBinary<any_fmul, MRM1m, "mul">;
}
let SchedRW = [WriteFDivLd] in {
-defm DIV : FPBinary<fdiv, MRM6m, "div">;
-defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
+defm DIV : FPBinary<any_fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<any_fdiv, MRM7m, "divr", 0>;
}
-} // Defs = [FPSW]
+} // Uses = [FPCW], mayRaiseFPException = 1
class FPST0rInst<Format fp, string asm>
: FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>;
@@ -319,7 +319,7 @@ class FPrST0PInst<Format fp, string asm>
// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
-let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFAdd], Uses = [FPCW], mayRaiseFPException = 1 in {
def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t{$op, %st|st, $op}">;
def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st, $op|$op, st}">;
def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t{%st, $op|$op, st}">;
@@ -330,16 +330,16 @@ def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t{$op, %st|st, $op}">;
def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st, $op|$op, st}">;
def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFCom], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFCom], Uses = [FPCW], mayRaiseFPException = 1 in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
-let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFMul], Uses = [FPCW], mayRaiseFPException = 1 in {
def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t{$op, %st|st, $op}">;
def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st, $op|$op, st}">;
def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFDiv], Uses = [FPCW], mayRaiseFPException = 1 in {
def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t{$op, %st|st, $op}">;
def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st, $op|$op, st}">;
def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t{%st, $op|$op, st}">;
@@ -359,20 +359,14 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
-let Defs = [FPSW], Uses = [FPCW] in {
-
let SchedRW = [WriteFSign] in {
defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
}
+let Uses = [FPCW], mayRaiseFPException = 1 in {
let SchedRW = [WriteFSqrt80] in
-defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
-
-let SchedRW = [WriteMicrocoded] in {
-defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
-defm COS : FPUnary<fcos, MRM_FF, "fcos">;
-}
+defm SQRT: FPUnary<any_fsqrt,MRM_FA, "fsqrt">;
let SchedRW = [WriteFCom] in { let hasSideEffects = 0 in { @@ -383,11 +377,11 @@ def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; } // SchedRW -} // Defs = [FPSW] +} // Uses = [FPCW], mayRaiseFPException = 1 // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. -let SchedRW = [WriteFComLd], Defs = [FPSW], Uses = [FPCW] in { +let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1 in { def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; @@ -402,14 +396,21 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; } // SchedRW let SchedRW = [WriteMicrocoded] in { +let Defs = [FPSW, FPCW] in { def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; -def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; - def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">; +} + +let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW] in { +def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">; +} + +let Uses = [FPSW] in def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">; def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +let Uses = [FPCW] ,mayRaiseFPException = 1 in def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; } // SchedRW @@ -435,7 +436,6 @@ multiclass FPCMov<PatLeaf cc> { Requires<[HasCMov]>; } -let Defs = [FPSW] in { let SchedRW = [WriteFCMOV] in { let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov<X86_COND_B>; @@ -469,6 +469,7 @@ def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op), } // Predicates = [HasCMov] } // SchedRW +let mayRaiseFPException = 1 in { // Floating point loads & stores. 
let SchedRW = [WriteLoad], Uses = [FPCW] in { let canFoldAsLoad = 1 in { @@ -485,6 +486,7 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +let mayRaiseFPException = 0 in { def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, [(set RFP32:$dst, (X86fild16 addr:$src))]>; def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, @@ -503,6 +505,7 @@ def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild32 addr:$src))]>; def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild64 addr:$src))]>; +} // mayRaiseFPException = 0 } // SchedRW let SchedRW = [WriteStore], Uses = [FPCW] in { @@ -546,10 +549,12 @@ let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; +let mayRaiseFPException = 0 in { def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; } +} let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in { def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; @@ -621,7 +626,7 @@ def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">; let SchedRW = [WriteFLD1], Uses = [FPCW] in def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">; -let SchedRW = [WriteFLDC], Uses = [FPCW] in { +let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in { def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>; def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>; @@ -632,29 +637,44 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>; // Floating point compares. 
let SchedRW = [WriteFCom], Uses = [FPCW] in { def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; + [(set FPSW, (trunc (X86any_fcmp RFP32:$lhs, RFP32:$rhs)))]>; def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; + [(set FPSW, (trunc (X86any_fcmp RFP64:$lhs, RFP64:$rhs)))]>; def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; + [(set FPSW, (trunc (X86any_fcmp RFP80:$lhs, RFP80:$rhs)))]>; +def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set FPSW, (trunc (X86strict_fcmps RFP32:$lhs, RFP32:$rhs)))]>; +def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set FPSW, (trunc (X86strict_fcmps RFP64:$lhs, RFP64:$rhs)))]>; +def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set FPSW, (trunc (X86strict_fcmps RFP80:$lhs, RFP80:$rhs)))]>; } // SchedRW -} // Defs = [FPSW] +} // mayRaiseFPException = 1 -let SchedRW = [WriteFCom] in { +let SchedRW = [WriteFCom], mayRaiseFPException = 1 in { // CC = ST(0) cmp ST(i) -let Defs = [EFLAGS, FPSW], Uses = [FPCW] in { +let Defs = [EFLAGS, FPCW], Uses = [FPCW] in { def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>, + [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>, Requires<[FPStackf32, HasCMov]>; def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>, + [(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>, Requires<[FPStackf64, HasCMov]>; def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>, + [(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>, + Requires<[HasCMov]>; +def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>, + Requires<[FPStackf32, HasCMov]>; +def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>, + Requires<[FPStackf64, HasCMov]>; +def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>, Requires<[HasCMov]>; } -let Defs = [FPSW], Uses = [ST0, FPCW] in { +let Uses = [ST0, FPCW] in { def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) (outs), (ins RSTi:$reg), "fucom\t$reg">; def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop @@ -678,7 +698,7 @@ def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg), // Floating point flag ops. 
let SchedRW = [WriteALU] in { -let Defs = [AX], Uses = [FPSW] in +let Defs = [AX, FPSW], Uses = [FPSW] in def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))]>; @@ -694,51 +714,61 @@ def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] // FPU control instructions let SchedRW = [WriteMicrocoded] in { -let Defs = [FPSW] in { -def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>; def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">; def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">; +let Defs = [FPSW, FPCW] in +def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>; // Clear exceptions +let Defs = [FPSW] in def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>; -} // Defs = [FPSW] } // SchedRW // Operand-less floating-point instructions for the disassembler. +let Defs = [FPSW] in def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>; let SchedRW = [WriteMicrocoded] in { let Defs = [FPSW] in { def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>; def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>; +def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>; +def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>; +let Uses = [FPCW], mayRaiseFPException = 1 in { def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>; def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>; def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>; def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>; def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>; def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>; -def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>; -def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>; def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>; def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>; +def FSIN : I<0xD9, MRM_FE, (outs), (ins), "fsin", []>; +def FCOS : I<0xD9, MRM_FF, (outs), (ins), "fcos", []>; def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>; def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>; +} // Uses = [FPCW], mayRaiseFPException = 1 } // Defs = [FPSW] +let Uses = [FPSW, FPCW] in { def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst), "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB, Requires<[HasFXSR]>; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst), "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>, TB, Requires<[HasFXSR, In64BitMode]>; +} // Uses = [FPSW, FPCW] + +let Defs = [FPSW, FPCW] in { def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>, TB, Requires<[HasFXSR]>; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>, TB, Requires<[HasFXSR, In64BitMode]>; +} // Defs = [FPSW, FPCW] } // SchedRW //===----------------------------------------------------------------------===// @@ -747,7 +777,10 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), // Required for RET of f32 / f64 / f80 values. 
def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>; +def : Pat<(X86fldf32 addr:$src), (LD_Fp32m64 addr:$src)>; def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>; +def : Pat<(X86fldf32 addr:$src), (LD_Fp32m80 addr:$src)>; +def : Pat<(X86fldf64 addr:$src), (LD_Fp64m80 addr:$src)>; def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. @@ -775,19 +808,19 @@ def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>; // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. -def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, +def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, +def : Pat<(f80 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, Requires<[FPStackf32]>; -def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, +def : Pat<(f80 (any_fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, Requires<[FPStackf64]>; // FP truncations map onto simple pseudo-value conversions if they are to/from // the FP stack. We have validated that only value-preserving truncations make // it through isel. -def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, +def : Pat<(f32 (any_fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, +def : Pat<(f32 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, +def : Pat<(f64 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, Requires<[FPStackf64]>; diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index e8f0d937dff4..2f797fcfb8de 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -227,6 +227,7 @@ class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } class NOTRACK { bit hasNoTrackPrefix = 1; } +class SIMD_EXC { list<Register> Uses = [MXCSR]; bit mayRaiseFPException = 1; } // Specify AVX512 8-bit compressed displacement encoding based on the vector // element size in bits (8, 16, 32, 64) and the CDisp8 form. @@ -441,12 +442,15 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, // FPStack Instruction Templates: // FPI - Floating Point Instruction template. class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> - : I<o, F, outs, ins, asm, []> {} + : I<o, F, outs, ins, asm, []> { + let Defs = [FPSW]; +} // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> : PseudoI<outs, ins, pattern> { let FPForm = fp; + let Defs = [FPSW]; } // Templates for instructions that use a 16- or 32-bit segmented address as diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index de6f8a81dff6..3250123e5aa6 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -127,11 +127,32 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, SDTCVecEltisVT<1, f32>, SDTCisSameSizeAs<0, 1>]>>; + +def X86strict_vfpext : SDNode<"X86ISD::STRICT_VFPEXT", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCisSameSizeAs<0, 1>]>, + [SDNPHasChain]>; + +def X86any_vfpext : PatFrags<(ops node:$src), + [(X86strict_vfpext node:$src), + (X86vfpext node:$src)]>; + def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisOpSmallerThanOp<0, 1>]>, + [SDNPHasChain]>; + +def X86any_vfpround : PatFrags<(ops node:$src), + [(X86strict_vfpround node:$src), + (X86vfpround node:$src)]>; + def X86frounds : SDNode<"X86ISD::VFPROUNDS", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCisSameAs<0, 1>, @@ -169,10 +190,15 @@ def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>; -def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86strict_cmpp : SDNode<"X86ISD::STRICT_CMPP", SDTX86VFCMP, [SDNPHasChain]>; +def X86any_cmpp : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_cmpp node:$src1, node:$src2, node:$src3), + (X86cmpp node:$src1, node:$src2, node:$src3)]>; + def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, SDTCisVec<1>, SDTCisSameAs<2, 1>, @@ -182,6 +208,10 @@ def X86CmpMaskCCScalar : SDTCisVT<3, i8>]>; def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>; +def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_cmpm node:$src1, node:$src2, node:$src3), + (X86cmpm node:$src1, node:$src2, node:$src3)]>; def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>; @@ -436,6 +466,12 @@ def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>; def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>; def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>; def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>; +def X86strict_VRndScale : SDNode<"X86ISD::STRICT_VRNDSCALE", SDTFPUnaryOpImm, + [SDNPHasChain]>; +def X86any_VRndScale : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_VRndScale node:$src1, node:$src2), + (X86VRndScale node:$src1, node:$src2)]>; + def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>; def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>; def X86VGetMantSAE : 
SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>; @@ -493,7 +529,11 @@ def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>; def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>; def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>; -def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fmadd : SDNode<"ISD::STRICT_FMA", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fmadd node:$src1, node:$src2, node:$src3), + (X86Fmadd node:$src1, node:$src2, node:$src3)]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; @@ -621,9 +661,26 @@ def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>; // cvtt fp-to-int staff def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>; def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>; +def X86strict_cvttp2si : SDNode<"X86ISD::STRICT_CVTTP2SI", SDTFloatToInt, [SDNPHasChain]>; +def X86strict_cvttp2ui : SDNode<"X86ISD::STRICT_CVTTP2UI", SDTFloatToInt, [SDNPHasChain]>; +def X86any_cvttp2si : PatFrags<(ops node:$src), + [(X86strict_cvttp2si node:$src), + (X86cvttp2si node:$src)]>; +def X86any_cvttp2ui : PatFrags<(ops node:$src), + [(X86strict_cvttp2ui node:$src), + (X86cvttp2ui node:$src)]>; def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>; def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; +def X86strict_VSintToFP : SDNode<"X86ISD::STRICT_CVTSI2P", SDTVintToFP, [SDNPHasChain]>; +def X86strict_VUintToFP : SDNode<"X86ISD::STRICT_CVTUI2P", SDTVintToFP, [SDNPHasChain]>; +def X86any_VSintToFP : PatFrags<(ops node:$src), + [(X86strict_VSintToFP node:$src), + (X86VSintToFP node:$src)]>; +def X86any_VUintToFP : PatFrags<(ops node:$src), + [(X86strict_VUintToFP node:$src), + (X86VUintToFP node:$src)]>; + // cvt int-to-fp staff def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; @@ -706,6 +763,10 @@ def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>; +def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> +]>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -1040,9 +1101,10 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, INSERT_get_vinsert256_imm>; def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_ld node:$src1, node:$src2, node:$src3), [{ + (masked_ld node:$src1, undef, node:$src2, node:$src3), [{ return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && - cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), @@ -1055,17 +1117,19 @@ def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), }]>; def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_ld node:$src1, node:$src2, 
node:$src3), [{ - return cast<MaskedLoadSDNode>(N)->isExpandingLoad(); + (masked_ld node:$src1, undef, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->isExpandingLoad() && + cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; // Masked store fragments. // X86mstore can't be implemented in core DAG files because some targets // do not support vector types (llvm-tblgen will fail). def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_st node:$src1, node:$src2, node:$src3), [{ - return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && - (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); + (masked_st node:$src1, node:$src2, undef, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + !cast<MaskedStoreSDNode>(N)->isCompressingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); }]>; def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), @@ -1078,16 +1142,18 @@ def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), }]>; def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_st node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedStoreSDNode>(N)->isCompressingStore(); + (masked_st node:$src1, node:$src2, undef, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isCompressingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); }]>; // masked truncstore fragments // X86mtruncstore can't be implemented in core DAG files because some targets // doesn't support vector type ( llvm-tblgen will fail) def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_st node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); + (masked_st node:$src1, node:$src2, undef, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); }]>; def masked_truncstorevi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), @@ -1111,10 +1177,10 @@ def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore, def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore, +def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore, +def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr), diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index c29029daeec9..245346d82731 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1761,10 +1761,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VCMPPSZ128rrik: case X86::VCMPPDZ256rrik: case X86::VCMPPSZ256rrik: { - unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f; + unsigned Imm = + MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f; Imm = X86::getSwappedVCMPImm(Imm); auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); + WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } @@ -2304,7 +2305,7 @@ unsigned X86::getCMovOpcode(unsigned 
RegBytes, bool HasMemoryOperand) { default: llvm_unreachable("Illegal register size!"); case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr; case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr; - case 8: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV64rr; + case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr; } } @@ -2963,8 +2964,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc) const { + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = Subtarget.hasAVX(); bool HasVLX = Subtarget.hasVLX(); @@ -3046,15 +3047,11 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, report_fatal_error("Cannot emit physreg copy instruction"); } -bool X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI, - const MachineOperand *&Src, - const MachineOperand *&Dest) const { - if (MI.isMoveReg()) { - Dest = &MI.getOperand(0); - Src = &MI.getOperand(1); - return true; - } - return false; +Optional<DestSourcePair> +X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { + if (MI.isMoveReg()) + return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; + return None; } static unsigned getLoadStoreRegOpcode(unsigned Reg, @@ -3221,8 +3218,9 @@ bool X86InstrInfo::getMemOperandWithOffset( Offset = DispMO.getImm(); - assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " - "operands of type register."); + if (!BaseOp->isReg()) + return false; + return true; } @@ -3963,9 +3961,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, MachineFunction &MF = *MBB.getParent(); const X86FrameLowering *TFL = Subtarget.getFrameLowering(); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsDwarfCFI = - !IsWin64Prologue && - (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry()); + bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves(); bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; if (EmitCFI) { TFL->BuildCFI(MBB, I, DL, @@ -4708,6 +4704,10 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, updateOperandRegConstraints(MF, *NewMI, TII); + // Copy the NoFPExcept flag from the instruction we're fusing. + if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept)) + NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept); + MachineBasicBlock *MBB = InsertPt->getParent(); MBB->insert(InsertPt, NewMI); @@ -7233,8 +7233,8 @@ bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel, bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const { - assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && - "Reassociation needs binary operators"); + assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 && + Inst.getNumDefs() <= 2 && "Reassociation needs binary operators"); // Integer binary math/logic instructions have a third source operand: // the EFLAGS register. That operand must be both defined here and never @@ -7242,13 +7242,11 @@ bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, // not change anything because rearranging the operands could affect other // instructions that depend on the exact status flags (zero, sign, etc.) // that are set by using these particular operands with this operation. 
- if (Inst.getNumOperands() == 4) { - assert(Inst.getOperand(3).isReg() && - Inst.getOperand(3).getReg() == X86::EFLAGS && - "Unexpected operand in reassociable instruction"); - if (!Inst.getOperand(3).isDead()) - return false; - } + const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS); + assert((Inst.getNumDefs() == 1 || FlagDef) && + "Implicit def isn't flags?"); + if (FlagDef && !FlagDef->isDead()) + return false; return TargetInstrInfo::hasReassociableOperands(Inst, MBB); } @@ -7558,15 +7556,57 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { } } +/// If \p DescribedReg overlaps with the MOVrr instruction's destination +/// register then, if possible, describe the value in terms of the source +/// register. +static Optional<ParamLoadedValue> +describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, + const TargetRegisterInfo *TRI) { + Register DestReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + + auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); + + // If the described register is the destination, just return the source. + if (DestReg == DescribedReg) + return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); + + // If the described register is a sub-register of the destination register, + // then pick out the source register's corresponding sub-register. + if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) { + unsigned SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx); + return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); + } + + // The remaining case to consider is when the described register is a + // super-register of the destination register. MOV8rr and MOV16rr does not + // write to any of the other bytes in the register, meaning that we'd have to + // describe the value using a combination of the source register and the + // non-overlapping bits in the described register, which is not currently + // possible. + if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr || + !TRI->isSuperRegister(DestReg, DescribedReg)) + return None; + + assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case"); + return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); +} + Optional<ParamLoadedValue> -X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { +X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineOperand *Op = nullptr; DIExpression *Expr = nullptr; + const TargetRegisterInfo *TRI = &getRegisterInfo(); + switch (MI.getOpcode()) { case X86::LEA32r: case X86::LEA64r: case X86::LEA64_32r: { + // We may need to describe a 64-bit parameter with a 32-bit LEA. + if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) + return None; + // Operand 4 could be global address. For now we do not support // such situation. 
if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) @@ -7574,7 +7614,6 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(3); - const TargetRegisterInfo *TRI = &getRegisterInfo(); assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister || Register::isPhysicalRegister(Op2.getReg()))); @@ -7638,13 +7677,56 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { return ParamLoadedValue(*Op, Expr);; } + case X86::MOV32ri: + case X86::MOV64ri: + case X86::MOV64ri32: + // MOV32ri may be used for producing zero-extended 32-bit immediates in + // 64-bit parameters, so we need to consider super-registers. + if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) + return None; + return ParamLoadedValue(MI.getOperand(1), Expr); + case X86::MOV8rr: + case X86::MOV16rr: + case X86::MOV32rr: + case X86::MOV64rr: + return describeMOVrrLoadedValue(MI, Reg, TRI); case X86::XOR32rr: { + // 64-bit parameters are zero-materialized using XOR32rr, so also consider + // super-registers. + if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) + return None; if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); return None; } + case X86::MOVSX64rr32: { + // We may need to describe the lower 32 bits of the MOVSX; for example, in + // cases like this: + // + // $ebx = [...] + // $rdi = MOVSX64rr32 $ebx + // $esi = MOV32rr $edi + if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg)) + return None; + + Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); + + // If the described register is the destination register we need to + // sign-extend the source register from 32 bits. The other case we handle + // is when the described register is the 32-bit sub-register of the + // destination register, in case we just need to return the source + // register. + if (Reg == MI.getOperand(0).getReg()) + Expr = DIExpression::appendExt(Expr, 32, 64, true); + else + assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) && + "Unhandled sub-register case for MOVSX64rr32"); + + return ParamLoadedValue(MI.getOperand(1), Expr); + } default: - return TargetInstrInfo::describeLoadedValue(MI); + assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction"); + return TargetInstrInfo::describeLoadedValue(MI, Reg); } } @@ -7654,38 +7736,31 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const { - // Integer instructions define an implicit EFLAGS source register operand as - // the third source (fourth total) operand. - if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) - return; + // Integer instructions may define an implicit EFLAGS dest register operand. 
+ MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS); + MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS); - assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + assert(!OldFlagDef1 == !OldFlagDef2 && "Unexpected instruction type for reassociation"); - MachineOperand &OldOp1 = OldMI1.getOperand(3); - MachineOperand &OldOp2 = OldMI2.getOperand(3); - MachineOperand &NewOp1 = NewMI1.getOperand(3); - MachineOperand &NewOp2 = NewMI2.getOperand(3); + if (!OldFlagDef1 || !OldFlagDef2) + return; - assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && - "Must have dead EFLAGS operand in reassociable instruction"); - assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() && "Must have dead EFLAGS operand in reassociable instruction"); - (void)OldOp1; - (void)OldOp2; + MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS); + MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS); - assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && - "Unexpected operand in reassociable instruction"); - assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + assert(NewFlagDef1 && NewFlagDef2 && "Unexpected operand in reassociable instruction"); // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations // of this pass or other passes. The EFLAGS operands must be dead in these new // instructions because the EFLAGS operands in the original instructions must // be dead in order for reassociation to occur. - NewOp1.setIsDead(); - NewOp2.setIsDead(); + NewFlagDef1->setIsDead(); + NewFlagDef2->setIsDead(); } std::pair<unsigned, unsigned> diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 22b7b1d4cb19..1d2da5305357 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -312,7 +312,7 @@ public: ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, @@ -522,8 +522,8 @@ public: return MI.getDesc().TSFlags & X86II::LOCK; } - Optional<ParamLoadedValue> - describeLoadedValue(const MachineInstr &MI) const override; + Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI, + Register Reg) const override; protected: /// Commutes the operands in the given instruction by changing the operands @@ -542,10 +542,10 @@ protected: unsigned CommuteOpIdx2) const override; /// If the specific machine instruction is a instruction that moves/copies - /// value from one register to another register return true along with - /// @Source machine operand and @Destination machine operand. - bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source, - const MachineOperand *&Destination) const override; + /// value from one register to another register return destination and source + /// registers as machine operands. + Optional<DestSourcePair> + isCopyInstrImpl(const MachineInstr &MI) const override; private: /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions. 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index e452145f3b65..ca5425e8b89f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -142,6 +142,8 @@ def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>; +def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>; def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; @@ -375,6 +377,9 @@ class X86VMemOperand<RegisterClass RC, string printMethod, } def anymem : X86MemOperand<"printanymem">; +def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs), + [(X86strict_fcmp node:$lhs, node:$rhs), + (X86cmp node:$lhs, node:$rhs)]>; // FIXME: Right now we allow any size during parsing, but we might want to // restrict to only unsized memory. @@ -449,18 +454,6 @@ def i64mem_TC : Operand<i64> { let OperandType = "OPERAND_MEMORY"; } -let OperandType = "OPERAND_PCREL", - ParserMatchClass = X86AbsMemAsmOperand, - PrintMethod = "printPCRelImm" in { -def i32imm_pcrel : Operand<i32>; -def i16imm_pcrel : Operand<i16>; - -// Branch targets have OtherVT type and print as pc-relative values. -def brtarget : Operand<OtherVT>; -def brtarget8 : Operand<OtherVT>; - -} - // Special parser to detect 16-bit mode to select 16-bit displacement. def X86AbsMem16AsmOperand : AsmOperandClass { let Name = "AbsMem16"; @@ -468,15 +461,27 @@ def X86AbsMem16AsmOperand : AsmOperandClass { let SuperClasses = [X86AbsMemAsmOperand]; } -// Branch targets have OtherVT type and print as pc-relative values. -let OperandType = "OPERAND_PCREL", - PrintMethod = "printPCRelImm" in { -let ParserMatchClass = X86AbsMem16AsmOperand in - def brtarget16 : Operand<OtherVT>; -let ParserMatchClass = X86AbsMemAsmOperand in - def brtarget32 : Operand<OtherVT>; +// Branch targets print as pc-relative values. +class BranchTargetOperand<ValueType ty> : Operand<ty> { + let OperandType = "OPERAND_PCREL"; + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; } +def i32imm_brtarget : BranchTargetOperand<i32>; +def i16imm_brtarget : BranchTargetOperand<i16>; + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_brtarget : BranchTargetOperand<i64>; + +def brtarget : BranchTargetOperand<OtherVT>; +def brtarget8 : BranchTargetOperand<OtherVT>; +def brtarget16 : BranchTargetOperand<OtherVT> { + let ParserMatchClass = X86AbsMem16AsmOperand; +} +def brtarget32 : BranchTargetOperand<OtherVT>; + let RenderMethod = "addSrcIdxOperands" in { def X86SrcIdx8Operand : AsmOperandClass { let Name = "SrcIdx8"; @@ -751,14 +756,6 @@ def i64u8imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } -// 64-bits but only 32 bits are significant, and those bits are treated as being -// pc relative. -def i64i32imm_pcrel : Operand<i64> { - let PrintMethod = "printPCRelImm"; - let ParserMatchClass = X86AbsMemAsmOperand; - let OperandType = "OPERAND_PCREL"; -} - def lea64_32mem : Operand<i32> { let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); @@ -983,12 +980,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). 
let RecomputePerFunction = 1 in { - def OptForSize : Predicate<"MF->getFunction().hasOptSize()">; + def OptForSize : Predicate<"shouldOptForSize(MF)">; def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">; - def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">; + def OptForSpeed : Predicate<"!shouldOptForSize(MF)">; def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " - "MF->getFunction().hasOptSize()">; - def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || " + "shouldOptForSize(MF)">; + def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || " "!Subtarget->hasSSE41()">; } @@ -2846,7 +2843,7 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in { //===----------------------------------------------------------------------===// // CLZERO Instruction // -let SchedRW = [WriteSystem] in { +let SchedRW = [WriteLoad] in { let Uses = [EAX] in def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB, Requires<[HasCLZERO, Not64BitMode]>; diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index cd9a866c91cb..0f4d4d764cc9 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -508,16 +508,16 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, PS; + WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC; defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", - WriteCvtPD2I, SSEPackedDouble>, PD; + WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC; defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, PS; + WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC; defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", - WriteCvtPD2I, SSEPackedDouble>, PD; + WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC; defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", WriteCvtI2PD, SSEPackedDouble>, PD; @@ -525,7 +525,7 @@ let Constraints = "$src1 = $dst" in { defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, int_x86_sse_cvtpi2ps, i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, PS; + SSEPackedSingle>, PS, SIMD_EXC; } // Extract / Insert diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 09a04c0338b4..c45f342ed75b 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -823,7 +823,9 @@ let Constraints = "$src1 = $dst" in { multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, string mem, X86FoldableSchedWrite sched, + Domain d, SchedRead Int2Fpu = ReadDefault> { + let ExeDomain = d in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpNode SrcRC:$src))]>, @@ -832,18 +834,19 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, mem#"\t{$src, $dst|$dst, $src}", [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, Sched<[sched.Folded]>; + } } multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, 
X86MemOperand x86memop, ValueType DstTy, ValueType SrcTy, PatFrag ld_frag, string asm, Domain d, X86FoldableSchedWrite sched> { -let hasSideEffects = 0 in { +let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm, - [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>, + [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>, Sched<[sched]>; let mayLoad = 1 in def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm, - [(set RC:$dst, (DstTy (sint_to_fp + [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy (ld_frag addr:$src)))))], d>, Sched<[sched.Folded]>; } @@ -851,8 +854,8 @@ let hasSideEffects = 0 in { multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, string mem, - X86FoldableSchedWrite sched> { -let hasSideEffects = 0, Predicates = [UseAVX] in { + X86FoldableSchedWrite sched, Domain d> { +let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>; @@ -864,22 +867,22 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { } // hasSideEffects = 0 } -let isCodeGenOnly = 1, Predicates = [UseAVX] in { -defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, +let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; -defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG; -defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; -defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; } @@ -889,60 +892,64 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, // where appropriate to do so. 
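Most of the churn in this hunk threads an ExeDomain through the conversion multiclasses and tags them with Uses = [MXCSR] and mayRaiseFPException (via SIMD_EXC) so the constrained any_sint_to_fp / any_fp_to_sint patterns stay ordered. A hedged standalone illustration of the underlying behaviour these flags model (not LLVM code; without "#pragma STDC FENV_ACCESS ON" the fenv checks are only best-effort, and volatile is used to block constant folding):

// int<->float conversions really can be inexact, which is what the
// constrained nodes and the MXCSR annotations above must not ignore.
#include <cfenv>
#include <cstdio>

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile int big = 2147483647;                 // not representable in float
  volatile float f = static_cast<float>(big);    // cvtsi2ss: rounds, inexact
  std::printf("float(%d) = %.1f, inexact=%d\n", big, (double)f,
              std::fetestexcept(FE_INEXACT) != 0);

  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float frac = 1.5f;
  volatile int truncated = static_cast<int>(frac); // cvttss2si: drops .5, inexact
  std::printf("int(1.5f) = %d, inexact=%d\n", truncated,
              std::fetestexcept(FE_INEXACT) != 0);
  return 0;
}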
let isCodeGenOnly = 1 in { defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l", - WriteCvtI2SS>, XS, VEX_4V, VEX_LIG; + WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q", - WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG; + WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + VEX_W, VEX_LIG, SIMD_EXC; defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l", - WriteCvtI2SD>, XD, VEX_4V, VEX_LIG; + WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + VEX_LIG; defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q", - WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG; + WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + VEX_W, VEX_LIG, SIMD_EXC; } // isCodeGenOnly = 1 let Predicates = [UseAVX] in { - def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))), (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))), (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; - def : Pat<(f32 (sint_to_fp GR32:$src)), + def : Pat<(f32 (any_sint_to_fp GR32:$src)), (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; - def : Pat<(f32 (sint_to_fp GR64:$src)), + def : Pat<(f32 (any_sint_to_fp GR64:$src)), (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>; - def : Pat<(f64 (sint_to_fp GR32:$src)), + def : Pat<(f64 (any_sint_to_fp GR32:$src)), (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; - def : Pat<(f64 (sint_to_fp GR64:$src)), + def : Pat<(f64 (any_sint_to_fp GR64:$src)), (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } let isCodeGenOnly = 1 in { -defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, XS; -defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, XS, REX_W; -defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, XD; -defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, XD, REX_W; -defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32, "cvtsi2ss", "cvtsi2ss{l}", - WriteCvtI2SS, ReadInt2Fpu>, XS; -defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC; +defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64, "cvtsi2ss", "cvtsi2ss{q}", - 
WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W; -defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32, "cvtsi2sd", "cvtsi2sd{l}", - WriteCvtI2SD, ReadInt2Fpu>, XD; -defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD; +defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64, "cvtsi2sd", "cvtsi2sd{q}", - WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W; + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC; } // isCodeGenOnly = 1 // Conversion Instructions Intrinsics - Match intrinsics which expect MM @@ -951,7 +958,8 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, ValueType DstVT, ValueType SrcVT, SDNode OpNode, Operand memop, ComplexPattern mem_cpat, string asm, - X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched, Domain d> { +let ExeDomain = d in { def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>, @@ -961,12 +969,13 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>, Sched<[sched.Folded]>; } +} multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, string mem, X86FoldableSchedWrite sched, - bit Is2Addr = 1> { -let hasSideEffects = 0 in { + Domain d, bit Is2Addr = 1> { +let hasSideEffects = 0, ExeDomain = d in { def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), @@ -982,39 +991,50 @@ let hasSideEffects = 0 in { } } +let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [UseAVX] in { defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", - WriteCvtSD2I>, XD, VEX, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", - WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; } defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, - sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD; + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, + SSEPackedDouble>, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, - sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; - + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, + SSEPackedDouble>, XD, REX_W; +} let Predicates = [UseAVX] in { defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG; + i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>, + XS, VEX_4V, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W; + i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>, + XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG; + i32mem, "cvtsi2sd", "l", 
WriteCvtI2SD, SSEPackedDouble, 0>, + XD, VEX_4V, VEX_LIG; defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W; + i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>, + XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; } let Constraints = "$src1 = $dst" in { defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS; + i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>, + XS, SIMD_EXC; defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W; + i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>, + XS, REX_W, SIMD_EXC; defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD; + i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>, + XD; defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W; + i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>, + XD, REX_W, SIMD_EXC; } def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1048,34 +1068,38 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", /// SSE 1 Only // Aliases for intrinsics -let Predicates = [UseAVX] in { +let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, - "cvttss2si", WriteCvtSS2I>, + "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG, VEX_W; defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSS2I>, XD, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, - "cvttsd2si", WriteCvtSS2I>, + "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG, VEX_W; } +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I>, XS; + WriteCvtSS2I, SSEPackedSingle>, XS; defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, - "cvttss2si", WriteCvtSS2I>, XS, REX_W; + "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, + XS, REX_W; defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSD2I>, XD; + WriteCvtSD2I, SSEPackedDouble>, XD; defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, - "cvttsd2si", WriteCvtSD2I>, XD, REX_W; + "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, + XD, REX_W; +} def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; @@ -1111,20 +1135,21 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">; -let Predicates = [UseAVX] in { +let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, 
VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG; } +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS; + WriteCvtSS2I, SSEPackedSingle>, XS; defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, REX_W; + WriteCvtSS2I, SSEPackedSingle>, XS, REX_W; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", @@ -1139,6 +1164,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, PS, Requires<[UseSSE2]>; +} // AVX aliases def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", @@ -1184,31 +1210,32 @@ def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX_4V, VEX_LIG, VEX_WIG, - Sched<[WriteCvtSD2SS]>; + Sched<[WriteCvtSD2SS]>, SIMD_EXC; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, VEX_4V, VEX_LIG, VEX_WIG, - Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; + Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC; } -def : Pat<(f32 (fpround FR64:$src)), +def : Pat<(f32 (any_fpround FR64:$src)), (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; let isCodeGenOnly = 1 in { def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fpround FR64:$src))]>, - Sched<[WriteCvtSD2SS]>; + [(set FR32:$dst, (any_fpround FR64:$src))]>, + Sched<[WriteCvtSD2SS]>, SIMD_EXC; def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>, + [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>, XD, Requires<[UseSSE2, OptForSize]>, - Sched<[WriteCvtSD2SS.Folded]>; + Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC; } +let Uses = [MXCSR], mayRaiseFPException = 1 in { def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1238,6 +1265,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } +} // Convert scalar single to scalar double // SSE2 instructions with XS prefix @@ -1246,34 +1274,34 @@ def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, VEX_LIG, VEX_WIG, - Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>; + Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, - Requires<[UseAVX, OptForSize]>; + Requires<[UseAVX, OptForSize]>, SIMD_EXC; } // isCodeGenOnly = 1, hasSideEffects = 0 -def : Pat<(f64 (fpextend FR32:$src)), +def : Pat<(f64 (any_fpextend FR32:$src)), (VCVTSS2SDrr 
(f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; -def : Pat<(fpextend (loadf32 addr:$src)), +def : Pat<(any_fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; let isCodeGenOnly = 1 in { def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (fpextend FR32:$src))]>, - XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>; + [(set FR64:$dst, (any_fpextend FR32:$src))]>, + XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>, + [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>, XS, Requires<[UseSSE2, OptForSize]>, - Sched<[WriteCvtSS2SD.Folded]>; + Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC; } // isCodeGenOnly = 1 -let hasSideEffects = 0 in { +let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1307,53 +1335,53 @@ let Predicates = [UseAVX] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseAVX] @@ 
-1361,55 +1389,55 @@ let Predicates = [UseSSE2] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseSSE2] let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; } // Predicates = [UseSSE1] @@ -1418,36 +1446,36 @@ let Predicates = [HasAVX, NoVLX] in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, - VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; + VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, - VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; + VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC; def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, - VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; + VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, - VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; + VEX, 
VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC; } def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, - Sched<[WriteCvtPS2I]>; + Sched<[WriteCvtPS2I]>, SIMD_EXC; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, - Sched<[WriteCvtPS2ILd]>; + Sched<[WriteCvtPS2ILd]>, SIMD_EXC; // Convert Packed Double FP to Packed DW Integers -let Predicates = [HasAVX, NoVLX] in { +let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. @@ -1486,35 +1514,36 @@ def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, - Sched<[WriteCvtPD2ILd]>; + Sched<[WriteCvtPD2ILd]>, SIMD_EXC; def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, - Sched<[WriteCvtPD2I]>; + Sched<[WriteCvtPD2I]>, SIMD_EXC; // Convert with truncation packed single/double fp to doubleword // SSE2 packed instructions with XS prefix +let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX, NoVLX] in { def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, + (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>, + (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>, VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>, + (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>, + (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; } @@ -1522,40 +1551,41 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src) def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, + (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, Sched<[WriteCvtPS2I]>; def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>, + (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>, Sched<[WriteCvtPS2ILd]>; +} // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. 
// Provide other assembly rr and rm forms to address this explicitly. -let Predicates = [HasAVX, NoVLX] in { +let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { // XMM only def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, + (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, + (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>, VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; // YMM only def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>, + (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, + (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; } // Predicates = [HasAVX, NoVLX] @@ -1565,29 +1595,29 @@ def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))), (VCVTTPD2DQYrm addr:$src)>; } def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, - Sched<[WriteCvtPD2I]>; + (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, + Sched<[WriteCvtPD2I]>, SIMD_EXC; def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>, - Sched<[WriteCvtPD2ILd]>; + (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>, + Sched<[WriteCvtPD2ILd]>, SIMD_EXC; // Convert packed single to packed double -let Predicates = [HasAVX, NoVLX] in { +let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, + [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", @@ -1595,7 +1625,7 @@ def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>, + [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, 
$dst|$dst, $src}", @@ -1603,10 +1633,10 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; } -let Predicates = [UseSSE2] in { +let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, + [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, PS, Sched<[WriteCvtPS2PD]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", @@ -1620,7 +1650,7 @@ let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP + (v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))))]>, @@ -1628,18 +1658,18 @@ def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, + (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>, + (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, VEX_WIG; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, - (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, + (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>, VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; } @@ -1647,7 +1677,7 @@ let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP + (v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))))]>, @@ -1655,18 +1685,18 @@ def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, + (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, Sched<[WriteCvtI2PD]>; // AVX register conversion intrinsics let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDrm addr:$src)>; } // Predicates = [HasAVX, NoVLX] // SSE2 register conversion intrinsics let Predicates = [UseSSE2] in { - def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (CVTDQ2PDrm addr:$src)>; } // Predicates = [UseSSE2] @@ -1674,24 +1704,24 @@ let Predicates = [UseSSE2] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
-let Predicates = [HasAVX, NoVLX] in { +let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { // XMM only def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, + [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, + [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>, VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround VR256:$src))]>, + [(set VR128:$dst, (X86any_vfpround VR256:$src))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>, + [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>, VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; } // Predicates = [HasAVX, NoVLX] @@ -1702,19 +1732,12 @@ def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, - Sched<[WriteCvtPD2PS]>; + [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, + Sched<[WriteCvtPD2PS]>, SIMD_EXC; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, - Sched<[WriteCvtPD2PS.Folded]>; - -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (fpround (v4f64 VR256:$src))), - (VCVTPD2PSYrr VR256:$src)>; - def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), - (VCVTPD2PSYrm addr:$src)>; -} + [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>, + Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Compare Instructions @@ -1725,6 +1748,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, X86FoldableSchedWrite sched> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { let isCommutable = 1 in def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, @@ -1736,6 +1760,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, (ld_frag addr:$src2), timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} let isCodeGenOnly = 1 in { let ExeDomain = SSEPackedSingle in @@ -1763,6 +1788,7 @@ let isCodeGenOnly = 1 in { multiclass sse12_cmp_scalar_int<Operand memop, Intrinsic Int, string asm, X86FoldableSchedWrite sched, ComplexPattern mem_cpat> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, @@ -1775,6 +1801,7 @@ let mayLoad = 1 in mem_cpat:$src, timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} // Aliases to match intrinsics which expect XMM operand(s). 
let ExeDomain = SSEPackedSingle in @@ -1802,9 +1829,10 @@ let Constraints = "$src1 = $dst" in { // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr, - X86FoldableSchedWrite sched> { -let hasSideEffects = 0 in { + PatFrag ld_frag, string OpcodeStr, Domain d, + X86FoldableSchedWrite sched = WriteFCom> { +let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, + ExeDomain = d in { def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, @@ -1822,7 +1850,9 @@ let mayLoad = 1 in multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, Operand memop, ComplexPattern mem_cpat, string OpcodeStr, - X86FoldableSchedWrite sched> { + Domain d, + X86FoldableSchedWrite sched = WriteFCom> { +let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in { def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, @@ -1834,52 +1864,48 @@ let mayLoad = 1 in mem_cpat:$src2))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} let Defs = [EFLAGS] in { - defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; - defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; - let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; - defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; - } + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, + "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, + "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; let isCodeGenOnly = 1 in { defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; - } - defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS; - defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD; - - let Pattern = []<dag> in { - defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - 
"comiss", WriteFCom>, PS; - defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD; + sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; } + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, PS; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, PD; + defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, + "comiss", SSEPackedSingle>, PS; + defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, + "comisd", SSEPackedDouble>, PD; let isCodeGenOnly = 1 in { defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS; + sse_load_f32, "ucomiss", SSEPackedSingle>, PS; defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD; + sse_load_f64, "ucomisd", SSEPackedDouble>, PD; defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS; + sse_load_f32, "comiss", SSEPackedSingle>, PS; defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD; + sse_load_f64, "comisd", SSEPackedDouble>, PD; } } // Defs = [EFLAGS] @@ -1888,17 +1914,19 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, ValueType VT, string asm, X86FoldableSchedWrite sched, Domain d, PatFrag ld_frag> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, - [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, + [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, Sched<[sched]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, - (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, + (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", @@ -1928,20 +1956,20 @@ def CommutableCMPCC : PatLeaf<(timm), [{ // Patterns to select compares with loads in first operand. 
let Predicates = [HasAVX] in { - def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>; - def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>; - def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; - def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, @@ -1954,8 +1982,8 @@ let Predicates = [HasAVX] in { } let Predicates = [UseSSE2] in { - def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, @@ -1964,8 +1992,8 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE1] in { - def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, - CommutableCMPCC:$cc)), + def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, @@ -2555,6 +2583,7 @@ def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), /// classes below multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteSizes sched> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, @@ -2580,9 +2609,11 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, sched.PD.XMM>, PD; } } +} multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteSizes sched> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; @@ -2599,10 +2630,12 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, sched.PD.Scl>, XD; } } +} multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86SchedWriteSizes sched> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; @@ -2619,20 +2652,21 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, SSEPackedDouble, sched.PD.Scl>, XD; } } +} // Binary Arithmetic instructions -defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>, - basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>, +defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s<0x58, "add", any_fadd, 
SchedWriteFAddSizes>, basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>; -defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>, - basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>, +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>, + basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>, basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; let isCommutable = 0 in { - defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>, - basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>, + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; - defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>, - basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>, + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, + basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, @@ -2727,15 +2761,15 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, } } -defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; -defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; -defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; -defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; +defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; -defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; -defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; -defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; -defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; +defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to @@ -2961,10 +2995,10 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, } // Square root. 
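The binop and scalar-math pattern updates here (fadd becoming any_fadd and friends) and the SIMD_EXC added to the SQRT definitions that follow the "Square root" heading below all serve the same purpose: the constrained nodes carry a chain, and the instructions are marked as touching MXCSR. A hedged standalone illustration of the flag-setting behaviour involved (not LLVM code; results depend on the platform honouring FENV_ACCESS):

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float big = 3.0e38f;
  volatile float prod = big * 10.0f;              // overflows to +inf
  std::printf("prod=%g overflow=%d\n", (double)prod,
              std::fetestexcept(FE_OVERFLOW) != 0);

  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float neg = -1.0f;
  volatile float r = std::sqrt(neg);              // NaN, raises invalid
  std::printf("sqrt(-1)=%g invalid=%d\n", (double)r,
              std::fetestexcept(FE_INVALID) != 0);
  return 0;
}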
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>; +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, + sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, + sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, + sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. @@ -2993,8 +3027,8 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Mo } } -defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; -defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; +defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, SDNode Move, ValueType VT, @@ -4436,6 +4470,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, X86FoldableSchedWrite sched, PatFrag ld_frag, bit Is2Addr = 1> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : I<0xD0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, @@ -4451,6 +4486,7 @@ multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { @@ -4488,6 +4524,7 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, X86FoldableSchedWrite sched, PatFrag ld_frag, bit Is2Addr = 1> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -4502,10 +4539,12 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, SDNode OpNode, X86FoldableSchedWrite sched, PatFrag ld_frag, bit Is2Addr = 1> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), @@ -4520,6 +4559,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } +} let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { @@ -5348,6 +5388,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched> { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg +let Uses = [MXCSR], mayRaiseFPException = 1 in { def r : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, @@ -5364,6 +5405,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, Sched<[sched.Folded]>; } +} multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { @@ -5400,6 +5442,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), @@ -5430,11 +5473,13 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } +} multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched, ValueType VT32, ValueType VT64, SDNode OpNode, bit Is2Addr = 1> { +let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle in { def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), @@ -5481,56 +5526,57 @@ let ExeDomain = SSEPackedDouble in { Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } +} // FP round - roundss, roundps, roundsd, roundpd let Predicates = [HasAVX, NoVLX] in { - let ExeDomain = SSEPackedSingle in { + let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in { // Intrinsic form defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, - loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, + loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>, VEX, VEX_WIG; defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, - loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>, + loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>, VEX, VEX_L, VEX_WIG; } - let ExeDomain = SSEPackedDouble in { + let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in { defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, - loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, + loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>, VEX, VEX_WIG; defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, - loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>, + loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>, VEX, VEX_L, VEX_WIG; } } let Predicates = [UseAVX] in { defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales, 0>, - VEX_4V, VEX_LIG, VEX_WIG; + VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, - VEX_4V, VEX_LIG, VEX_WIG; + VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; } let Predicates = [UseAVX] in { - def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + def : 
Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; } let ExeDomain = SSEPackedSingle in defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, - memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>; + memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>; let ExeDomain = SSEPackedDouble in defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, - memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>; + memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>; defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; @@ -5539,16 +5585,16 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { - def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), (ROUNDSSr FR32:$src1, timm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), (ROUNDSDr FR64:$src1, timm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), (ROUNDSSm addr:$src1, timm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), (ROUNDSDm addr:$src1, timm:$src2)>; } @@ -5959,6 +6005,7 @@ let Predicates = [HasAVX] in { SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; } +let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, load, f128mem, 0, @@ -5972,6 +6019,7 @@ let Predicates = [HasAVX] in { VR256, load, i256mem, 0, SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; } +} let Predicates = [HasAVX2] in { let isCommutable = 0 in { @@ -5991,11 +6039,11 @@ let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memop, f128mem, 1, - SchedWriteDPPS.XMM>; + SchedWriteDPPS.XMM>, SIMD_EXC; let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, VR128, memop, f128mem, 1, - SchedWriteDPPD.XMM>; + SchedWriteDPPD.XMM>, SIMD_EXC; } /// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate @@ -7266,12 +7314,12 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, } let Predicates = [HasF16C, NoVLX] in { - defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>; - defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L; + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, - WriteCvtPS2PHSt>; + WriteCvtPS2PHSt>, SIMD_EXC; defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, - WriteCvtPS2PHYSt>, VEX_L; + WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; // Pattern match vcvtph2ps of a scalar i64 load. 
def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td index 3a1212342a13..41b839425ccd 100644 --- a/llvm/lib/Target/X86/X86InstrTSX.td +++ b/llvm/lib/Target/X86/X86InstrTSX.td @@ -31,7 +31,7 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), "xbegin\t$dst", []>, OpSize32; } -// Psuedo instruction to fake the definition of EAX on the fallback code path. +// Pseudo instruction to fake the definition of EAX on the fallback code path. let isPseudo = 1, Defs = [EAX] in { def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>; } diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp index 01620b7b64c9..3f9d626ff912 100644 --- a/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -34,6 +34,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" @@ -111,8 +112,6 @@ private: bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const; - bool selectShift(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF) const; bool selectDivRem(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectIntrinsicWSideEffects(MachineInstr &I, MachineRegisterInfo &MRI, @@ -342,7 +341,7 @@ bool X86InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_STORE: case TargetOpcode::G_LOAD: return selectLoadStoreOp(I, MRI, MF); - case TargetOpcode::G_GEP: + case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_FRAME_INDEX: return selectFrameIndexOrGep(I, MRI, MF); case TargetOpcode::G_GLOBAL_VALUE: @@ -380,10 +379,6 @@ bool X86InstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_IMPLICIT_DEF: case TargetOpcode::G_PHI: return selectImplicitDefOrPHI(I, MRI); - case TargetOpcode::G_SHL: - case TargetOpcode::G_ASHR: - case TargetOpcode::G_LSHR: - return selectShift(I, MRI, MF); case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: case TargetOpcode::G_SREM: @@ -482,7 +477,7 @@ static void X86SelectAddress(const MachineInstr &I, assert(MRI.getType(I.getOperand(0).getReg()).isPointer() && "unsupported type."); - if (I.getOpcode() == TargetOpcode::G_GEP) { + if (I.getOpcode() == TargetOpcode::G_PTR_ADD) { if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) { int64_t Imm = *COff; if (isInt<32>(Imm)) { // Check for displacement overflow. 
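The hunks above rename G_GEP to G_PTR_ADD without changing the folding logic, and the guard they keep is worth spelling out: an x86 memory operand encodes at most a signed 32-bit displacement, so a constant pointer offset can only be absorbed into the address when it fits that field. Below is a minimal sketch of that check, assuming LLVM's isInt<> helper; the function name and the explicit CurDisp parameter are illustrative only, since the selector above folds the offset into its address-mode structure rather than a bare integer.

#include <cstdint>

#include "llvm/Support/MathExtras.h"

// Illustrative only: decide whether a constant G_PTR_ADD offset can be folded
// into an existing x86 displacement without overflowing the signed 32-bit
// disp field of the addressing mode.
static bool canFoldPtrAddOffset(int64_t CurDisp, int64_t Offset) {
  if (!llvm::isInt<32>(CurDisp) || !llvm::isInt<32>(Offset))
    return false;
  // Two values that each fit in 32 bits cannot overflow int64_t when added,
  // so this sum is safe; the combined displacement must still fit in disp32.
  return llvm::isInt<32>(CurDisp + Offset);
}

The hunk itself tests only the raw offset, presumably because the address being built has no displacement accumulated yet; the combined check matters once several offsets are chained onto one base.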
@@ -566,7 +561,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, MachineFunction &MF) const { unsigned Opc = I.getOpcode(); - assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) && + assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_PTR_ADD) && "unexpected instruction"); const Register DefReg = I.getOperand(0).getReg(); @@ -1225,7 +1220,7 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg, if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { - LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); + LLVM_DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n"); return false; } @@ -1519,78 +1514,6 @@ bool X86InstructionSelector::selectImplicitDefOrPHI( return true; } -// Currently GlobalIsel TableGen generates patterns for shift imm and shift 1, -// but with shiftCount i8. In G_LSHR/G_ASHR/G_SHL like LLVM-IR both arguments -// has the same type, so for now only shift i8 can use auto generated -// TableGen patterns. -bool X86InstructionSelector::selectShift(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - - assert((I.getOpcode() == TargetOpcode::G_SHL || - I.getOpcode() == TargetOpcode::G_ASHR || - I.getOpcode() == TargetOpcode::G_LSHR) && - "unexpected instruction"); - - Register DstReg = I.getOperand(0).getReg(); - const LLT DstTy = MRI.getType(DstReg); - const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); - - const static struct ShiftEntry { - unsigned SizeInBits; - unsigned OpLSHR; - unsigned OpASHR; - unsigned OpSHL; - } OpTable[] = { - {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8 - {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16 - {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32 - {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64 - }; - - if (DstRB.getID() != X86::GPRRegBankID) - return false; - - auto ShiftEntryIt = std::find_if( - std::begin(OpTable), std::end(OpTable), [DstTy](const ShiftEntry &El) { - return El.SizeInBits == DstTy.getSizeInBits(); - }); - if (ShiftEntryIt == std::end(OpTable)) - return false; - - unsigned Opcode = 0; - switch (I.getOpcode()) { - case TargetOpcode::G_SHL: - Opcode = ShiftEntryIt->OpSHL; - break; - case TargetOpcode::G_ASHR: - Opcode = ShiftEntryIt->OpASHR; - break; - case TargetOpcode::G_LSHR: - Opcode = ShiftEntryIt->OpLSHR; - break; - default: - return false; - } - - Register Op0Reg = I.getOperand(1).getReg(); - Register Op1Reg = I.getOperand(2).getReg(); - - assert(MRI.getType(Op1Reg).getSizeInBits() == 8); - - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), - X86::CL) - .addReg(Op1Reg); - - MachineInstr &ShiftInst = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg) - .addReg(Op0Reg); - - constrainSelectedInstRegOperands(ShiftInst, TII, TRI, RBI); - I.eraseFromParent(); - return true; -} - bool X86InstructionSelector::selectDivRem(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 1d7adbaa9e99..40bf28df3b90 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -15,6 +15,7 @@ #include "X86ISelLowering.h" #include "X86InstrInfo.h" +#include "llvm/IR/IntrinsicsX86.h" namespace llvm { diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp index 04121f863c89..da53d6420021 
100644 --- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -77,7 +77,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizeScalarToDifferentSizeStrategy(MemOp, 0, narrowToSmallerAndWidenToSmallest); setLegalizeScalarToDifferentSizeStrategy( - G_GEP, 1, widenToLargerTypesUnsupportedOtherwise); + G_PTR_ADD, 1, widenToLargerTypesUnsupportedOtherwise); setLegalizeScalarToDifferentSizeStrategy( G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest); @@ -140,8 +140,8 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_FRAME_INDEX, p0}, Legal); setAction({G_GLOBAL_VALUE, p0}, Legal); - setAction({G_GEP, p0}, Legal); - setAction({G_GEP, 1, s32}, Legal); + setAction({G_PTR_ADD, p0}, Legal); + setAction({G_PTR_ADD, 1, s32}, Legal); if (!Subtarget.is64Bit()) { getActionDefinitionsBuilder(G_PTRTOINT) @@ -223,7 +223,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({MemOp, s64}, Legal); // Pointer-handling - setAction({G_GEP, 1, s64}, Legal); + setAction({G_PTR_ADD, 1, s64}, Legal); getActionDefinitionsBuilder(G_PTRTOINT) .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0}) .maxScalar(0, s64) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 78098fd6262f..2fc9a2af01d7 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -569,6 +569,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { unsigned NewOpc; switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; @@ -640,6 +641,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { unsigned NewOpc; switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; @@ -876,6 +878,52 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { case X86::MOVSX64rr32: SimplifyMOVSX(OutMI); break; + + case X86::VCMPPDrri: + case X86::VCMPPDYrri: + case X86::VCMPPSrri: + case X86::VCMPPSYrri: + case X86::VCMPSDrr: + case X86::VCMPSSrr: { + // Swap the operands if it will enable a 2 byte VEX encoding. + // FIXME: Change the immediate to improve opportunities? + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) { + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + default: break; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + std::swap(OutMI.getOperand(1), OutMI.getOperand(2)); + break; + } + } + break; + } + + case X86::VMOVHLPSrr: + case X86::VUNPCKHPDrr: + // These are not truly commutable so hide them from the default case. + break; + + default: { + // If the instruction is a commutable arithmetic instruction we might be + // able to commute the operands to get a 2 byte VEX prefix. 
+ uint64_t TSFlags = MI->getDesc().TSFlags; + if (MI->getDesc().isCommutable() && + (TSFlags & X86II::EncodingMask) == X86II::VEX && + (TSFlags & X86II::OpMapMask) == X86II::TB && + (TSFlags & X86II::FormMask) == X86II::MRMSrcReg && + !(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) && + OutMI.getNumOperands() == 3) { + if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) && + X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) + std::swap(OutMI.getOperand(1), OutMI.getOperand(2)); + } + break; + } } } @@ -983,13 +1031,32 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, } } +/// Return the longest nop which can be efficiently decoded for the given +/// target cpu. 15-bytes is the longest single NOP instruction, but some +/// platforms can't decode the longest forms efficiently. +static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) { + uint64_t MaxNopLength = 10; + if (STI.getFeatureBits()[X86::ProcIntelSLM]) + MaxNopLength = 7; + else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) + MaxNopLength = 15; + else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) + MaxNopLength = 11; + return MaxNopLength; +} + /// Emit the largest nop instruction smaller than or equal to \p NumBytes /// bytes. Return the size of nop emitted. static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSubtargetInfo &STI) { - // This works only for 64bit. For 32bit we have to do additional checking if - // the CPU supports multi-byte nops. - assert(Is64Bit && "EmitNops only supports X86-64"); + if (!Is64Bit) { + // TODO Do additional checking if the CPU supports multi-byte nops. + OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI); + return 1; + } + + // Cap a single nop emission at the profitable value for the target + NumBytes = std::min(NumBytes, MaxLongNopLength(STI)); unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; @@ -1094,10 +1161,35 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, } } +/// A RAII helper which defines a region of instructions which can't have +/// padding added between them for correctness. 
+struct NoAutoPaddingScope { + MCStreamer &OS; + const bool OldAllowAutoPadding; + NoAutoPaddingScope(MCStreamer &OS) + : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) { + changeAndComment(false); + } + ~NoAutoPaddingScope() { + changeAndComment(OldAllowAutoPadding); + } + void changeAndComment(bool b) { + if (b == OS.getAllowAutoPadding()) + return; + OS.setAllowAutoPadding(b); + if (b) + OS.emitRawComment("autopadding"); + else + OS.emitRawComment("noautopadding"); + } +}; + void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + StatepointOpers SOpers(&MI); if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), @@ -1148,7 +1240,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, // Record our statepoint node in the same section used by STACKMAP // and PATCHPOINT - SM.recordStatepoint(MI); + auto &Ctx = OutStreamer->getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(MILabel); + SM.recordStatepoint(*MILabel, MI); } void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, @@ -1156,6 +1251,8 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, // <opcode>, <operands> + NoAutoPaddingScope NoPadScope(*OutStreamer); + Register DefRegister = FaultingMI.getOperand(0).getReg(); FaultMaps::FaultKind FK = static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); @@ -1163,8 +1260,12 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, unsigned Opcode = FaultingMI.getOperand(3).getImm(); unsigned OperandsBeginIdx = 4; + auto &Ctx = OutStreamer->getContext(); + MCSymbol *FaultingLabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(FaultingLabel); + assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); - FM.recordFaultingOp(FK, HandlerLabel); + FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel); MCInst MI; MI.setOpcode(Opcode); @@ -1199,6 +1300,8 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL) { // PATCHABLE_OP minsize, opcode, operands + NoAutoPaddingScope NoPadScope(*OutStreamer); + unsigned MinSize = MI.getOperand(0).getImm(); unsigned Opcode = MI.getOperand(1).getImm(); @@ -1236,7 +1339,12 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, // <id>, <shadowBytes>, ... 
void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); - SM.recordStackMap(MI); + + auto &Ctx = OutStreamer->getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(MILabel); + + SM.recordStackMap(*MILabel, MI); unsigned NumShadowBytes = MI.getOperand(1).getImm(); SMShadowTracker.reset(NumShadowBytes); } @@ -1249,7 +1357,12 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); - SM.recordPatchPoint(MI); + NoAutoPaddingScope NoPadScope(*OutStreamer); + + auto &Ctx = OutStreamer->getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(MILabel); + SM.recordPatchPoint(*MILabel, MI); PatchPointOpers opers(&MI); unsigned ScratchIdx = opers.getNextScratchIdx(); @@ -1305,6 +1418,8 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. // @@ -1337,10 +1452,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, // The default C calling convention will place two arguments into %rcx and // %rdx -- so we only work with those. - unsigned DestRegs[] = {X86::RDI, X86::RSI}; + const Register DestRegs[] = {X86::RDI, X86::RSI}; bool UsedMask[] = {false, false}; // Filled out in loop. - unsigned SrcRegs[] = {0, 0}; + Register SrcRegs[] = {0, 0}; // Then we put the operands in the %rdi and %rsi registers. We spill the // values in the register before we clobber them, and mark them as used in @@ -1350,7 +1465,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, for (unsigned I = 0; I < MI.getNumOperands(); ++I) if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) { assert(Op->isReg() && "Only support arguments in registers"); - SrcRegs[I] = Op->getReg(); + SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64); if (SrcRegs[I] != DestRegs[I]) { UsedMask[I] = true; EmitAndCountInstruction( @@ -1361,6 +1476,9 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, } // Now that the register values are stashed, mov arguments into place. + // FIXME: This doesn't work if one of the later SrcRegs is equal to an + // earlier DestReg. We will have already overwritten over the register before + // we can copy from it. for (unsigned I = 0; I < MI.getNumOperands(); ++I) if (SrcRegs[I] != DestRegs[I]) EmitAndCountInstruction( @@ -1396,6 +1514,8 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. // @@ -1429,11 +1549,11 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, // An x86-64 convention may place three arguments into %rcx, %rdx, and R8, // so we'll work with those. Or we may be called via SystemV, in which case // we don't have to do any translation. 
- unsigned DestRegs[] = {X86::RDI, X86::RSI, X86::RDX}; + const Register DestRegs[] = {X86::RDI, X86::RSI, X86::RDX}; bool UsedMask[] = {false, false, false}; // Will fill out src regs in the loop. - unsigned SrcRegs[] = {0, 0, 0}; + Register SrcRegs[] = {0, 0, 0}; // Then we put the operands in the SystemV registers. We spill the values in // the registers before we clobber them, and mark them as used in UsedMask. @@ -1443,7 +1563,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) { // TODO: Is register only support adequate? assert(Op->isReg() && "Only supports arguments in registers"); - SrcRegs[I] = Op->getReg(); + SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64); if (SrcRegs[I] != DestRegs[I]) { UsedMask[I] = true; EmitAndCountInstruction( @@ -1459,6 +1579,9 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, // is clobbers. We've already added nops to account for the size of mov and // push if the register is in the right place, so we only have to worry about // emitting movs. + // FIXME: This doesn't work if one of the later SrcRegs is equal to an + // earlier DestReg. We will have already overwritten over the register before + // we can copy from it. for (unsigned I = 0; I < MI.getNumOperands(); ++I) if (UsedMask[I]) EmitAndCountInstruction( @@ -1490,6 +1613,19 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, X86MCInstLower &MCIL) { + + NoAutoPaddingScope NoPadScope(*OutStreamer); + + const Function &F = MF->getFunction(); + if (F.hasFnAttribute("patchable-function-entry")) { + unsigned Num; + if (F.getFnAttribute("patchable-function-entry") + .getValueAsString() + .getAsInteger(10, Num)) + return; + EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo()); + return; + } // We want to emit the following pattern: // // .p2align 1, ... @@ -1517,6 +1653,8 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL) { + NoAutoPaddingScope NoPadScope(*OutStreamer); + // Since PATCHABLE_RET takes the opcode of the return statement as an // argument, we use that to emit the correct form of the RET that we want. // i.e. when we see this: @@ -1547,6 +1685,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { + NoAutoPaddingScope NoPadScope(*OutStreamer); + // Like PATCHABLE_RET, we have the actual instruction in the operands to this // instruction so we lower that particular instruction and its operands. // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index c6da4b09dd60..b19d1263e0c9 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "X86MacroFusion.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" @@ -18,160 +19,13 @@ using namespace llvm; -namespace { - -// The classification for the first instruction. 
-enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; - -// The classification for the second instruction (jump). -enum class JumpKind { - // JE, JL, JG and variants. - ELG, - // JA, JB and variants. - AB, - // JS, JP, JO and variants. - SPO, - // Not a fusable jump. - Invalid, -}; - -} // namespace - -static FirstInstrKind classifyFirst(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return FirstInstrKind::Invalid; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - case X86::TEST8ri: - case X86::TEST16ri: - case X86::TEST32ri: - case X86::TEST64ri32: - case X86::TEST8mr: - case X86::TEST16mr: - case X86::TEST32mr: - case X86::TEST64mr: - return FirstInstrKind::Test; - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND16rm: - case X86::AND16rr: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND32rm: - case X86::AND32rr: - case X86::AND64ri32: - case X86::AND64ri8: - case X86::AND64rm: - case X86::AND64rr: - case X86::AND8ri: - case X86::AND8rm: - case X86::AND8rr: - return FirstInstrKind::And; - case X86::CMP16ri: - case X86::CMP16ri8: - case X86::CMP16rm: - case X86::CMP16rr: - case X86::CMP16mr: - case X86::CMP32ri: - case X86::CMP32ri8: - case X86::CMP32rm: - case X86::CMP32rr: - case X86::CMP32mr: - case X86::CMP64ri32: - case X86::CMP64ri8: - case X86::CMP64rm: - case X86::CMP64rr: - case X86::CMP64mr: - case X86::CMP8ri: - case X86::CMP8rm: - case X86::CMP8rr: - case X86::CMP8mr: - return FirstInstrKind::Cmp; - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri8_DB: - case X86::ADD16ri_DB: - case X86::ADD16rm: - case X86::ADD16rr: - case X86::ADD16rr_DB: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD32ri8_DB: - case X86::ADD32ri_DB: - case X86::ADD32rm: - case X86::ADD32rr: - case X86::ADD32rr_DB: - case X86::ADD64ri32: - case X86::ADD64ri32_DB: - case X86::ADD64ri8: - case X86::ADD64ri8_DB: - case X86::ADD64rm: - case X86::ADD64rr: - case X86::ADD64rr_DB: - case X86::ADD8ri: - case X86::ADD8ri_DB: - case X86::ADD8rm: - case X86::ADD8rr: - case X86::ADD8rr_DB: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB16rm: - case X86::SUB16rr: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB32rm: - case X86::SUB32rr: - case X86::SUB64ri32: - case X86::SUB64ri8: - case X86::SUB64rm: - case X86::SUB64rr: - case X86::SUB8ri: - case X86::SUB8rm: - case X86::SUB8rr: - return FirstInstrKind::ALU; - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::INC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::DEC8r: - return FirstInstrKind::IncDec; - } +static X86::FirstMacroFusionInstKind classifyFirst(const MachineInstr &MI) { + return X86::classifyFirstOpcodeInMacroFusion(MI.getOpcode()); } -static JumpKind classifySecond(const MachineInstr &MI) { +static X86::SecondMacroFusionInstKind classifySecond(const MachineInstr &MI) { X86::CondCode CC = X86::getCondFromBranch(MI); - if (CC == X86::COND_INVALID) - return JumpKind::Invalid; - - switch (CC) { - default: - return JumpKind::Invalid; - case X86::COND_E: - case X86::COND_NE: - case X86::COND_L: - case X86::COND_LE: - case X86::COND_G: - case X86::COND_GE: - return JumpKind::ELG; - case X86::COND_B: - case X86::COND_BE: - case X86::COND_A: - case X86::COND_AE: - return JumpKind::AB; - case X86::COND_S: - case X86::COND_NS: - case X86::COND_P: - case X86::COND_NP: - case X86::COND_O: - case X86::COND_NO: - return JumpKind::SPO; - } + return X86::classifySecondCondCodeInMacroFusion(CC); 
} /// Check if the instr pair, FirstMI and SecondMI, should be fused @@ -187,40 +41,27 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) return false; - const JumpKind BranchKind = classifySecond(SecondMI); + const X86::SecondMacroFusionInstKind BranchKind = classifySecond(SecondMI); - if (BranchKind == JumpKind::Invalid) + if (BranchKind == X86::SecondMacroFusionInstKind::Invalid) return false; // Second cannot be fused with anything. if (FirstMI == nullptr) return true; // We're only checking whether Second can be fused at all. - const FirstInstrKind TestKind = classifyFirst(*FirstMI); + const X86::FirstMacroFusionInstKind TestKind = classifyFirst(*FirstMI); if (ST.hasBranchFusion()) { // Branch fusion can merge CMP and TEST with all conditional jumps. - return (TestKind == FirstInstrKind::Cmp || - TestKind == FirstInstrKind::Test); + return (TestKind == X86::FirstMacroFusionInstKind::Cmp || + TestKind == X86::FirstMacroFusionInstKind::Test); } if (ST.hasMacroFusion()) { - // Macro Fusion rules are a bit more complex. See Agner Fog's - // Microarchitecture table 9.2 "Instruction Fusion". - switch (TestKind) { - case FirstInstrKind::Test: - case FirstInstrKind::And: - return true; - case FirstInstrKind::Cmp: - case FirstInstrKind::ALU: - return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB; - case FirstInstrKind::IncDec: - return BranchKind == JumpKind::ELG; - case FirstInstrKind::Invalid: - return false; - } + return X86::isMacroFused(TestKind, BranchKind); } - llvm_unreachable("unknown branch fusion type"); + llvm_unreachable("unknown fusion type"); } namespace llvm { diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 1aee01563c4b..0c791b6674dc 100644 --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -25,6 +25,8 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -32,6 +34,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -247,6 +250,12 @@ public: static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + private: using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>; @@ -294,9 +303,9 @@ private: DenseMap<const MachineInstr *, unsigned> InstrPos; - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; - const X86RegisterInfo *TRI; + MachineRegisterInfo *MRI = nullptr; + const X86InstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; }; } // end anonymous namespace @@ -681,6 +690,11 @@ bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + auto *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + auto *MBFI = (PSI && 
PSI->hasProfileSummary()) ? + &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; // Process all basic blocks. for (auto &MBB : MF) { @@ -699,7 +713,9 @@ bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass. - if (MF.getFunction().hasOptSize()) + bool OptForSize = MF.getFunction().hasOptSize() || + llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); + if (OptForSize) Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index af974c805c36..4c6bd0ccc2cd 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -17,8 +17,11 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Function.h" @@ -52,6 +55,12 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); @@ -105,6 +114,12 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { TSM.init(&MF.getSubtarget()); + auto *PSI = + &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) ? + &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : + nullptr; + // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); @@ -118,6 +133,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *MBB = I->first; unsigned Cycles = I->second; + // Function::hasOptSize is already checked above. + bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI); + if (OptForSize) + continue; + if (Cycles < Threshold) { // BB ends in a return. Skip over any DBG_VALUE instructions // trailing the terminator. 
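Both passes touched above (X86OptimizeLEAs and X86PadShortFunction) wire in the same profile plumbing: require ProfileSummaryInfoWrapperPass and LazyMachineBlockFrequencyInfoPass, then treat a block as size-optimized when the function is built with -Os/-Oz or the profile marks that block as cold. A condensed sketch of the pattern in a stand-alone machine-function pass follows; the pass name and the transform body are placeholders, not part of the patch.

#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/IR/Function.h"

using namespace llvm;

namespace {
// Placeholder pass used only to show the analysis wiring.
struct SizeGatedExamplePass : MachineFunctionPass {
  static char ID;
  SizeGatedExamplePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    // Block frequencies are only worth computing when a profile is present.
    auto *MBFI = PSI->hasProfileSummary()
                     ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
                     : nullptr;
    bool Changed = false;
    for (MachineBasicBlock &MBB : MF) {
      // -Os/-Oz applies everywhere; otherwise ask the profile about this block.
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MBB, PSI, MBFI);
      if (!OptForSize)
        continue;
      // ... the size-only rewrite of MBB would go here ...
      Changed = true;
    }
    return Changed;
  }
};
char SizeGatedExamplePass::ID = 0;
} // end anonymous namespace

Guarding MBFI behind hasProfileSummary() keeps the lazy block-frequency computation free for builds without PGO data, which appears to be why both hunks use the same null-MBFI convention.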
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td index 5610f4bc8873..93238983afa2 100644 --- a/llvm/lib/Target/X86/X86PfmCounters.td +++ b/llvm/lib/Target/X86/X86PfmCounters.td @@ -81,14 +81,14 @@ def HaswellPfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; let UopsCounter = UopsIssuedPfmCounter; let IssueCounters = [ - PfmIssueCounter<"HWPort0", "uops_dispatched_port:port_0">, - PfmIssueCounter<"HWPort1", "uops_dispatched_port:port_1">, - PfmIssueCounter<"HWPort2", "uops_dispatched_port:port_2">, - PfmIssueCounter<"HWPort3", "uops_dispatched_port:port_3">, - PfmIssueCounter<"HWPort4", "uops_dispatched_port:port_4">, - PfmIssueCounter<"HWPort5", "uops_dispatched_port:port_5">, - PfmIssueCounter<"HWPort6", "uops_dispatched_port:port_6">, - PfmIssueCounter<"HWPort7", "uops_dispatched_port:port_7"> + PfmIssueCounter<"HWPort0", "uops_executed_port:port_0">, + PfmIssueCounter<"HWPort1", "uops_executed_port:port_1">, + PfmIssueCounter<"HWPort2", "uops_executed_port:port_2">, + PfmIssueCounter<"HWPort3", "uops_executed_port:port_3">, + PfmIssueCounter<"HWPort4", "uops_executed_port:port_4">, + PfmIssueCounter<"HWPort5", "uops_executed_port:port_5">, + PfmIssueCounter<"HWPort6", "uops_executed_port:port_6">, + PfmIssueCounter<"HWPort7", "uops_executed_port:port_7"> ]; } def : PfmCountersBinding<"haswell", HaswellPfmCounters>; diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp index daddf4231897..9c076d2d6769 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp @@ -40,8 +40,9 @@ X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); } -const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass( - const TargetRegisterClass &RC) const { +const RegisterBank & +X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const { if (X86::GR8RegClass.hasSubClassEq(&RC) || X86::GR16RegClass.hasSubClassEq(&RC) || diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.h b/llvm/lib/Target/X86/X86RegisterBankInfo.h index c1f3001c6180..d5afd2cae761 100644 --- a/llvm/lib/Target/X86/X86RegisterBankInfo.h +++ b/llvm/lib/Target/X86/X86RegisterBankInfo.h @@ -64,8 +64,8 @@ private: public: X86RegisterBankInfo(const TargetRegisterInfo &TRI); - const RegisterBank & - getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const override; InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index ff625325b4c9..f69626b2622e 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -341,6 +341,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return (HasSSE ? CSR_32_RegCall_SaveList : CSR_32_RegCall_NoSSE_SaveList); } + case CallingConv::CFGuard_Check: + assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86"); + return (HasSSE ? CSR_Win32_CFGuard_Check_SaveList + : CSR_Win32_CFGuard_Check_NoSSE_SaveList); case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -455,6 +459,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return (HasSSE ? 
CSR_32_RegCall_RegMask : CSR_32_RegCall_NoSSE_RegMask); } + case CallingConv::CFGuard_Check: + assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86"); + return (HasSSE ? CSR_Win32_CFGuard_Check_RegMask + : CSR_Win32_CFGuard_Check_NoSSE_RegMask); case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; @@ -515,24 +523,27 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the floating point control register as reserved. Reserved.set(X86::FPCW); + // Set the floating point status register as reserved. + Reserved.set(X86::FPSW); + + // Set the SIMD floating point control register as reserved. + Reserved.set(X86::MXCSR); + // Set the stack-pointer register and its aliases as reserved. - for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RSP)) + Reserved.set(SubReg); // Set the Shadow Stack Pointer as reserved. Reserved.set(X86::SSP); // Set the instruction pointer register and its aliases as reserved. - for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RIP)) + Reserved.set(SubReg); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { - for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) + Reserved.set(SubReg); } // Set the base-pointer register and its aliases as reserved if needed. @@ -545,9 +556,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "this calling convention."); Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); - for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); - I.isValid(); ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr)) + Reserved.set(SubReg); } // Mark the segment registers as reserved. diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 0528b90c1fd5..3cfaf714e93e 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -294,6 +294,11 @@ def FPSW : X86Reg<"fpsr", 0>; // Floating-point control word def FPCW : X86Reg<"fpcr", 0>; +// SIMD Floating-point control register. +// Note: We only model the "Uses" of the control bits: current rounding modes, +// DAZ, FTZ and exception masks. We don't model the "Defs" of flag bits. +def MXCSR : X86Reg<"mxcsr", 0>; + // Status flags register. 
// // Note that some flags that are commonly thought of as part of the status diff --git a/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/llvm/lib/Target/X86/X86RetpolineThunks.cpp index f8464c7e8298..9085d7f068ac 100644 --- a/llvm/lib/Target/X86/X86RetpolineThunks.cpp +++ b/llvm/lib/Target/X86/X86RetpolineThunks.cpp @@ -63,13 +63,13 @@ public: } private: - MachineModuleInfo *MMI; - const TargetMachine *TM; - bool Is64Bit; - const X86Subtarget *STI; - const X86InstrInfo *TII; + MachineModuleInfo *MMI = nullptr; + const TargetMachine *TM = nullptr; + bool Is64Bit = false; + const X86Subtarget *STI = nullptr; + const X86InstrInfo *TII = nullptr; - bool InsertedThunks; + bool InsertedThunks = false; void createThunkFunction(Module &M, StringRef Name); void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 78acb1065ec8..b0153ca9da36 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -888,8 +888,7 @@ def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> { let Latency = 174; let ResourceCycles = [174]; } -def : InstRW<[AtomWrite01_174], (instrs FSINCOS)>; -def : InstRW<[AtomWrite01_174], (instregex "(COS|SIN)_F")>; +def : InstRW<[AtomWrite01_174], (instrs FSINCOS, FSIN, FCOS)>; def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> { let Latency = 183; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 8e3ce721f1a1..dcd155ea0e0e 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -202,8 +202,8 @@ defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>; defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>; defm : X86WriteResPairUnsupported<WriteFAddZ>; defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>; -defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 3>; -defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 4, [2]>; +defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 4, [2]>; defm : X86WriteResPairUnsupported<WriteFAdd64Z>; defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>; defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>; @@ -219,8 +219,8 @@ defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]> defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; defm : X86WriteResPairUnsupported<WriteFMulZ>; defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; -defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; -defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>; +defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>; defm : X86WriteResPairUnsupported<WriteFMul64Z>; defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>; defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>; @@ -380,8 +380,8 @@ def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> { // Horizontal add/sub instructions. 
//////////////////////////////////////////////////////////////////////////////// -defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>; -defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>; +defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 6, [6], 4>; +defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 6, [6], 4>; defm : X86WriteResPairUnsupported<WriteFHAddZ>; defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>; @@ -486,7 +486,7 @@ defm : X86WriteResPairUnsupported<WriteFBlendZ>; defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>; defm : X86WriteResPairUnsupported<WriteVarBlendY>; defm : X86WriteResPairUnsupported<WriteVarBlendZ>; -defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>; defm : X86WriteResPairUnsupported<WriteFVarBlendY>; defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; defm : X86WriteResPairUnsupported<WriteFShuffle256>; @@ -511,4 +511,20 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHSt>; defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>; defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; +// Remaining SLM instrs. + +def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4]; +} +def: InstRW<[SLMWriteResGroup1rr], (instrs PADDQrr, PSUBQrr, PCMPEQQrr)>; + +def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,4]; +} +def: InstRW<[SLMWriteResGroup1rm], (instrs PADDQrm, PSUBQrm, PCMPEQQrm)>; + } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td new file mode 100644 index 000000000000..4537d9cc7956 --- /dev/null +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -0,0 +1,1548 @@ +//=- X86ScheduleZnver2.td - X86 Znver2 Scheduling -------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver2 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver2Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 224; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver2Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Three AGU units (ZAGU0, ZAGU1, ZAGU2) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
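For orientation, the headline numbers declared in Znver2Model above (issue width, micro-op buffer size, load latency, mispredict penalty, and the incomplete-model flag) surface to generic code through MCSchedModel. The small, hypothetical dump routine below shows where those TableGen values end up; constructing an MCSubtargetInfo for -mcpu=znver2 is assumed to happen elsewhere.

#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"

// Print the scheduling parameters of whatever CPU the MCSubtargetInfo was
// created for; with -mcpu=znver2 these come from Znver2Model.
static void dumpSchedParams(const llvm::MCSubtargetInfo &STI) {
  const llvm::MCSchedModel &SM = STI.getSchedModel();
  llvm::outs() << "IssueWidth:        " << SM.IssueWidth << '\n'
               << "MicroOpBufferSize: " << SM.MicroOpBufferSize << '\n'
               << "LoadLatency:       " << SM.LoadLatency << '\n'
               << "MispredictPenalty: " << SM.MispredictPenalty << '\n'
               << "CompleteModel:     " << SM.CompleteModel << '\n';
}

Leaving CompleteModel clear is what the FIXME above is about: it keeps the scheduling-model checks from erroring out on instructions the Znver2 tables do not cover yet.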
+ +// Four ALU units are defined below +def Zn2ALU0 : ProcResource<1>; +def Zn2ALU1 : ProcResource<1>; +def Zn2ALU2 : ProcResource<1>; +def Zn2ALU3 : ProcResource<1>; + +// Three AGU units are defined below +def Zn2AGU0 : ProcResource<1>; +def Zn2AGU1 : ProcResource<1>; +def Zn2AGU2 : ProcResource<1>; + +// Four FPU units are defined below +def Zn2FPU0 : ProcResource<1>; +def Zn2FPU1 : ProcResource<1>; +def Zn2FPU2 : ProcResource<1>; +def Zn2FPU3 : ProcResource<1>; + +// FPU grouping +def Zn2FPU013 : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU3]>; +def Zn2FPU01 : ProcResGroup<[Zn2FPU0, Zn2FPU1]>; +def Zn2FPU12 : ProcResGroup<[Zn2FPU1, Zn2FPU2]>; +def Zn2FPU13 : ProcResGroup<[Zn2FPU1, Zn2FPU3]>; +def Zn2FPU23 : ProcResGroup<[Zn2FPU2, Zn2FPU3]>; +def Zn2FPU02 : ProcResGroup<[Zn2FPU0, Zn2FPU2]>; +def Zn2FPU03 : ProcResGroup<[Zn2FPU0, Zn2FPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// Zn2ALU03 - 0,3 grouping +def Zn2ALU03: ProcResGroup<[Zn2ALU0, Zn2ALU3]>; + +// 64 Entry (16x4 entries) Int Scheduler +def Zn2ALU : ProcResGroup<[Zn2ALU0, Zn2ALU1, Zn2ALU2, Zn2ALU3]> { + let BufferSize=64; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def Zn2AGU : ProcResGroup<[Zn2AGU0, Zn2AGU1, Zn2AGU2]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def Zn2Multiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def Zn2Divider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance<ReadAfterLd, 4>; + +// 7 Cycles vector load-to use Latency is captured +def : ReadAdvance<ReadAfterVecLd, 7>; +def : ReadAdvance<ReadAfterVecXLd, 7>; +def : ReadAdvance<ReadAfterVecYLd, 7>; + +def : ReadAdvance<ReadInt2Fpu, 0>; + +// The Integer PRF for Zen is 168 entries, and it holds the architectural and +// speculative version of the 64-bit integer registers. +// Reference: "Software Optimization Guide for AMD Family 17h Processors" +def Zn2IntegerPRF : RegisterFile<168, [GR64, CCR]>; + +// 36 Entry (9x4 entries) floating-point Scheduler +def Zn2FPU : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU2, Zn2FPU3]> { + let BufferSize=36; +} + +// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit +// registers. Operations on 256-bit data types are cracked into two COPs. +// Reference: "Software Optimization Guide for AMD Family 17h Processors" +def Zn2FpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>; + +// The unit can track up to 192 macro ops in-flight. +// The retire unit handles in-order commit of up to 8 macro ops per cycle. +// Reference: "Software Optimization Guide for AMD Family 17h Processors" +// To be noted, the retire unit is shared between integer and FP ops. +// In SMT mode it is 96 entry per thread. But, we do not use the conservative +// value here because there is currently no way to fully mode the SMT mode, +// so there is no point in trying. +def Zn2RCU : RetireControlUnit<192, 8>; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. 
+multiclass Zn2WriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [], int UOps = 1, + int LoadLat = 4, int LoadUOps = 1> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on Zn2AGU + // adds LoadLat cycles to the latency (default = 4). + def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let NumMicroOps = !add(UOps, LoadUOps); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass Zn2WriteResFpuPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [], int UOps = 1, + int LoadLat = 7, int LoadUOps = 0> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on Zn2AGU + // adds LoadLat cycles to the latency (default = 7). + def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let NumMicroOps = !add(UOps, LoadUOps); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes<WriteRMW, [Zn2AGU]>; + +def : WriteRes<WriteStore, [Zn2AGU]>; +def : WriteRes<WriteStoreNT, [Zn2AGU]>; +def : WriteRes<WriteMove, [Zn2ALU]>; +def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; } + +def : WriteRes<WriteZero, []>; +def : WriteRes<WriteLEA, [Zn2ALU]>; +defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteADC, [Zn2ALU], 1>; + +defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>; + +defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>; +defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>; +defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>; +defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>; +defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>; + +defm : Zn2WriteResPair<WriteShift, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteShiftCL, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteRotate, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteRotateCL, [Zn2ALU], 1>; + +defm : X86WriteRes<WriteSHDrri, [Zn2ALU], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteSHDrrcl>; +defm : X86WriteResUnsupported<WriteSHDmri>; +defm : X86WriteResUnsupported<WriteSHDmrcl>; + +defm : Zn2WriteResPair<WriteJump, [Zn2ALU], 1>; +defm : Zn2WriteResFpuPair<WriteCRC32, [Zn2FPU0], 3>; + +defm : Zn2WriteResPair<WriteCMOV, [Zn2ALU], 1>; +def : WriteRes<WriteSETCC, [Zn2ALU]>; +def : WriteRes<WriteSETCCStore, [Zn2ALU, Zn2AGU]>; +defm : X86WriteRes<WriteLAHFSAHF, [Zn2ALU], 2, [1], 2>; + +defm : X86WriteRes<WriteBitTest, [Zn2ALU], 1, [1], 1>; +defm : X86WriteRes<WriteBitTestImmLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>; +defm : X86WriteRes<WriteBitTestRegLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>; +defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>; + +// Bit counts. +defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>; +defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>; +defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>; +defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>; + +// Treat misc copies as a move. 
+def : InstRW<[WriteMove], (instrs COPY)>; + +// BMI1 BEXTR, BMI2 BZHI +defm : Zn2WriteResPair<WriteBEXTR, [Zn2ALU], 1>; +defm : Zn2WriteResPair<WriteBZHI, [Zn2ALU], 1>; + +// IDIV +defm : Zn2WriteResPair<WriteDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>; +defm : Zn2WriteResPair<WriteDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>; +defm : Zn2WriteResPair<WriteDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>; +defm : Zn2WriteResPair<WriteDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>; +defm : Zn2WriteResPair<WriteIDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>; +defm : Zn2WriteResPair<WriteIDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>; +defm : Zn2WriteResPair<WriteIDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>; +defm : Zn2WriteResPair<WriteIDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>; + +// IMULH +def : WriteRes<WriteIMulH, [Zn2ALU1, Zn2Multiplier]>{ + let Latency = 4; +} + +// Floating point operations +defm : X86WriteRes<WriteFLoad, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>; + +defm : X86WriteRes<WriteFStore, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreX, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreY, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreNT, [Zn2AGU,Zn2FPU2], 8, [1,1], 1>; +defm : X86WriteRes<WriteFStoreNTX, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>; + +defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFAddY, [Zn2FPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; +defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; +defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>; +defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : Zn2WriteResFpuPair<WriteFVarBlend, [Zn2FPU01], 1>; +defm : Zn2WriteResFpuPair<WriteFVarBlendY,[Zn2FPU01], 1>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; +defm : Zn2WriteResFpuPair<WriteVarBlend, [Zn2FPU0], 1>; +defm : Zn2WriteResFpuPair<WriteVarBlendY, [Zn2FPU0], 1>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : Zn2WriteResFpuPair<WriteCvtSS2I, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtPS2I, [Zn2FPU3], 5>; +defm : 
Zn2WriteResFpuPair<WriteCvtPS2IY, [Zn2FPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : Zn2WriteResFpuPair<WriteCvtSD2I, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtPD2I, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtPD2IY, [Zn2FPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; +defm : Zn2WriteResFpuPair<WriteCvtI2SS, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtI2PS, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtI2PSY, [Zn2FPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : Zn2WriteResFpuPair<WriteCvtI2SD, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtI2PD, [Zn2FPU3], 5>; +defm : Zn2WriteResFpuPair<WriteCvtI2PDY, [Zn2FPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; +defm : Zn2WriteResFpuPair<WriteFDiv, [Zn2FPU3], 15>; +defm : Zn2WriteResFpuPair<WriteFDivX, [Zn2FPU3], 15>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>; +defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; +defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>; +defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 4, [1], 1, 7, 0>; +defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 4, [1], 1, 7, 0>; +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : Zn2WriteResFpuPair<WriteFTest, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteFTestY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>; +defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>; +defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 4, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 4, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; +defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>; +defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>; +defm : Zn2WriteResFpuPair<WriteFMAY, [Zn2FPU03], 5>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : Zn2WriteResFpuPair<WriteFRcp, [Zn2FPU01], 5>; +defm : Zn2WriteResFpuPair<WriteFRcpX, [Zn2FPU01], 5>; +defm : Zn2WriteResFpuPair<WriteFRcpY, [Zn2FPU01], 5, [1], 1, 7, 2>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; +defm : Zn2WriteResFpuPair<WriteFRsqrtX, [Zn2FPU01], 5, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; +defm : Zn2WriteResFpuPair<WriteFSqrt, [Zn2FPU3], 20, [20]>; +defm : Zn2WriteResFpuPair<WriteFSqrtX, [Zn2FPU3], 20, [20]>; +defm : Zn2WriteResFpuPair<WriteFSqrtY, [Zn2FPU3], 28, [28], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : Zn2WriteResFpuPair<WriteFSqrt64, [Zn2FPU3], 20, [20]>; +defm : Zn2WriteResFpuPair<WriteFSqrt64X, [Zn2FPU3], 20, [20]>; +defm : Zn2WriteResFpuPair<WriteFSqrt64Y, [Zn2FPU3], 20, [20], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : 
Zn2WriteResFpuPair<WriteFSqrt80, [Zn2FPU3], 20, [20]>; + +// Vector integer operations which uses FPU units +defm : X86WriteRes<WriteVecLoad, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [Zn2AGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>; +defm : X86WriteRes<WriteVecMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>; +defm : X86WriteRes<WriteVecStore, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreX, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMaskedStore, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStoreY, [Zn2AGU,Zn2FPU01], 5, [1,1], 2>; +defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>; +defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>; +defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>; + +defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>; +defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 2>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; +defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecShiftImmY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : Zn2WriteResFpuPair<WriteVecLogic, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecLogicX, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecLogicY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : Zn2WriteResFpuPair<WriteVecTest, [Zn2FPU12], 1, [2], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteVecTestY, [Zn2FPU12], 1, [2], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : Zn2WriteResFpuPair<WriteVecALU, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecALUX, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVecALUY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : Zn2WriteResFpuPair<WriteVecIMul, [Zn2FPU0], 4>; +defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>; +defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 3, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteShuffleY, [Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : Zn2WriteResFpuPair<WriteVarShuffle, [Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVarShuffleX,[Zn2FPU], 1>; +defm : Zn2WriteResFpuPair<WriteVarShuffleY,[Zn2FPU], 1>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : Zn2WriteResFpuPair<WriteBlend, [Zn2FPU01], 1>; +defm : Zn2WriteResFpuPair<WriteBlendY, [Zn2FPU01], 1>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : Zn2WriteResFpuPair<WriteShuffle256, [Zn2FPU], 2>; +defm : Zn2WriteResFpuPair<WriteVarShuffle256, [Zn2FPU], 
2>; +defm : Zn2WriteResFpuPair<WritePSADBW, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WritePSADBWX, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WritePSADBWY, [Zn2FPU0], 3>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>; + +// Vector Shift Operations +defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 1>; +defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; + +// Vector insert/extract operations. +defm : Zn2WriteResFpuPair<WriteVecInsert, [Zn2FPU], 1>; + +def : WriteRes<WriteVecExtract, [Zn2FPU12, Zn2FPU2]> { + let Latency = 2; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteVecExtractSt, [Zn2AGU, Zn2FPU12, Zn2FPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 2, 3]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [Zn2FPU2]>; +def : WriteRes<WriteMMXMOVMSK, [Zn2FPU2]>; +def : WriteRes<WriteVecMOVMSK, [Zn2FPU2]>; + +def : WriteRes<WriteVecMOVMSKY, [Zn2FPU2]> { + let NumMicroOps = 2; + let Latency = 2; + let ResourceCycles = [2]; +} + +// AES Instructions. +defm : Zn2WriteResFpuPair<WriteAESDecEnc, [Zn2FPU01], 4>; +defm : Zn2WriteResFpuPair<WriteAESIMC, [Zn2FPU01], 4>; +defm : Zn2WriteResFpuPair<WriteAESKeyGen, [Zn2FPU01], 4>; + +def : WriteRes<WriteFence, [Zn2AGU]>; +def : WriteRes<WriteNop, []>; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : Zn2WriteResFpuPair<WriteFShuffle256, [Zn2FPU], 100>; +defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>; + +// Microcoded Instructions +def Zn2WriteMicrocoded : SchedWriteRes<[]> { + let Latency = 100; +} + +def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>; +def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>; +def : SchedAlias<WriteSystem, Zn2WriteMicrocoded>; +def : SchedAlias<WriteMPSAD, Zn2WriteMicrocoded>; +def : SchedAlias<WriteMPSADY, Zn2WriteMicrocoded>; +def : SchedAlias<WriteMPSADLd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteMPSADYLd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCLMul, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCLMulLd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpIStrM, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpIStrMLd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpEStrI, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpEStrILd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpEStrM, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpEStrMLd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpIStrI, Zn2WriteMicrocoded>; +def : SchedAlias<WritePCmpIStrILd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteLDMXCSR, Zn2WriteMicrocoded>; +def : SchedAlias<WriteSTMXCSR, Zn2WriteMicrocoded>; + +//=== Regex based InstRW ===// +// Notation: +// - r: register. +// - m = memory. +// - i = immediate +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. + +//=== Integer Instructions ===// +//-- Move instructions --// +// MOV. +// r16,m. +def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// XCHG. +// r,r. +def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> { + let NumMicroOps = 2; +} + +def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. 
+def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> { + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>; + +def : InstRW<[WriteMicrocoded], (instrs XLAT)>; + +// POP16. +// r. +def Zn2WritePop16r : SchedWriteRes<[Zn2AGU]>{ + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[Zn2WritePop16r], (instregex "POP16rmm")>; +def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>; +def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>; + + +// PUSH. +// r. Has default values. +// m. +def Zn2WritePUSH : SchedWriteRes<[Zn2AGU]>{ + let Latency = 4; +} +def : InstRW<[Zn2WritePUSH], (instregex "PUSH(16|32)rmm")>; + +//PUSHF +def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def Zn2WritePushA : SchedWriteRes<[Zn2AGU]> { + let Latency = 8; +} +def : InstRW<[Zn2WritePushA], (instregex "PUSHA(16|32)")>; + +//LAHF +def : InstRW<[WriteMicrocoded], (instrs LAHF)>; + +// MOVBE. +// r,m. +def Zn2WriteMOVBE : SchedWriteRes<[Zn2AGU, Zn2ALU]> { + let Latency = 5; +} +def : InstRW<[Zn2WriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>; + +// m16,r16. +def : InstRW<[Zn2WriteMOVBE], (instregex "MOVBE(16|32|64)mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", + "(ADD|SUB)64mi32")>; + +// ADC SBB. +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteALULd], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>; + +// MUL IMUL. +// r16. +def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { + let Latency = 3; +} +def : SchedAlias<WriteIMul16, Zn2WriteMul16>; +def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>; +def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>; + +// m16. +def Zn2WriteMul16Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { + let Latency = 7; +} +def : SchedAlias<WriteIMul16Ld, Zn2WriteMul16Ld>; +def : SchedAlias<WriteIMul16ImmLd, Zn2WriteMul16Ld>; +def : SchedAlias<WriteIMul16RegLd, Zn2WriteMul16Ld>; + +// r32. +def Zn2WriteMul32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { + let Latency = 3; +} +def : SchedAlias<WriteIMul32, Zn2WriteMul32>; +def : SchedAlias<WriteIMul32Imm, Zn2WriteMul32>; +def : SchedAlias<WriteIMul32Reg, Zn2WriteMul32>; + +// m32. +def Zn2WriteMul32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { + let Latency = 7; +} +def : SchedAlias<WriteIMul32Ld, Zn2WriteMul32Ld>; +def : SchedAlias<WriteIMul32ImmLd, Zn2WriteMul32Ld>; +def : SchedAlias<WriteIMul32RegLd, Zn2WriteMul32Ld>; + +// r64. +def Zn2WriteMul64 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : SchedAlias<WriteIMul64, Zn2WriteMul64>; +def : SchedAlias<WriteIMul64Imm, Zn2WriteMul64>; +def : SchedAlias<WriteIMul64Reg, Zn2WriteMul64>; + +// m64. +def Zn2WriteMul64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { + let Latency = 8; + let NumMicroOps = 2; +} +def : SchedAlias<WriteIMul64Ld, Zn2WriteMul64Ld>; +def : SchedAlias<WriteIMul64ImmLd, Zn2WriteMul64Ld>; +def : SchedAlias<WriteIMul64RegLd, Zn2WriteMul64Ld>; + +// MULX. +// r32,r32,r32. +def Zn2WriteMulX32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { + let Latency = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[Zn2WriteMulX32], (instrs MULX32rr)>; + +// r32,r32,m32. 
+def Zn2WriteMulX32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { + let Latency = 7; + let ResourceCycles = [1, 2, 2]; +} +def : InstRW<[Zn2WriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>; + +// r64,r64,r64. +def Zn2WriteMulX64 : SchedWriteRes<[Zn2ALU1]> { + let Latency = 3; +} +def : InstRW<[Zn2WriteMulX64], (instrs MULX64rr)>; + +// r64,r64,m64. +def Zn2WriteMulX64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { + let Latency = 7; +} +def : InstRW<[Zn2WriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. +def Zn2WriteJCXZ : SchedWriteRes<[Zn2ALU03]>; +def : InstRW<[Zn2WriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>; + +// INTO +def : InstRW<[WriteMicrocoded], (instrs INTO)>; + +// LOOP. +def Zn2WriteLOOP : SchedWriteRes<[Zn2ALU03]>; +def : InstRW<[Zn2WriteLOOP], (instrs LOOP)>; + +// LOOP(N)E, LOOP(N)Z +def Zn2WriteLOOPE : SchedWriteRes<[Zn2ALU03]>; +def : InstRW<[Zn2WriteLOOPE], (instrs LOOPE, LOOPNE)>; + +// CALL. +// r. +def Zn2WriteCALLr : SchedWriteRes<[Zn2AGU, Zn2ALU03]>; +def : InstRW<[Zn2WriteCALLr], (instregex "CALL(16|32)r")>; + +def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>; + +// RET. +def Zn2WriteRET : SchedWriteRes<[Zn2ALU03]> { + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)", + "IRET(16|32|64)")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// Define ALU latency variants +def Zn2WriteALULat2 : SchedWriteRes<[Zn2ALU]> { + let Latency = 2; +} +def Zn2WriteALULat2Ld : SchedWriteRes<[Zn2AGU, Zn2ALU]> { + let Latency = 6; +} + +// BT. +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def Zn2WriteBTRSC : SchedWriteRes<[Zn2ALU]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r,i. +def Zn2WriteBTRSCm : SchedWriteRes<[Zn2AGU, Zn2ALU]> { + let Latency = 6; + let NumMicroOps = 2; +} +// m,r,i. +def : SchedAlias<WriteBitTestSetImmRMW, Zn2WriteBTRSCm>; +def : SchedAlias<WriteBitTestSetRegRMW, Zn2WriteBTRSCm>; + +// BLSI BLSMSK BLSR. +// r,r. +def : SchedAlias<WriteBLS, Zn2WriteALULat2>; +// r,m. +def : SchedAlias<WriteBLSLd, Zn2WriteALULat2Ld>; + +// CLD STD. +def : InstRW<[WriteALU], (instrs STD, CLD)>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,r,m. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +// RCR RCL. +// m,i. +def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>; + +// SHR SHL SAR. +// m,i. +def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// SHRD SHLD. +// m,r +def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>; + +// m,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +//-- Misc instructions --// +// CMPXCHG8B. +def Zn2WriteCMPXCHG8B : SchedWriteRes<[Zn2AGU, Zn2ALU]> { + let NumMicroOps = 18; +} +def : InstRW<[Zn2WriteCMPXCHG8B], (instrs CMPXCHG8B)>; + +def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>; + +// LEAVE +def Zn2WriteLEAVE : SchedWriteRes<[Zn2ALU, Zn2AGU]> { + let Latency = 8; + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteLEAVE], (instregex "LEAVE")>; + +// PAUSE. 
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>; + +// RDTSC. +def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>; + +// RDPMC. +def : InstRW<[WriteMicrocoded], (instrs RDPMC)>; + +// RDRAND. +def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>; + +// XGETBV. +def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>; + +//-- String instructions --// +// CMPS. +def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>; + +// LODSB/W. +def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>; + +// MOVS. +def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>; + +// STOS +def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>; + +// XADD. +def Zn2XADD : SchedWriteRes<[Zn2ALU]>; +def : InstRW<[Zn2XADD], (instregex "XADD(8|16|32|64)rr")>; +def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +def Zn2WriteFLDr : SchedWriteRes<[Zn2FPU13]> ; + +def Zn2WriteSTr: SchedWriteRes<[Zn2FPU23]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// LD_F. +// r. +def : InstRW<[Zn2WriteFLDr], (instregex "LD_Frr")>; + +// m. +def Zn2WriteLD_F80m : SchedWriteRes<[Zn2AGU, Zn2FPU13]> { + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[Zn2WriteSTr], (instregex "ST_(F|FP)rr")>; + +// m80. +def Zn2WriteST_FP80m : SchedWriteRes<[Zn2AGU, Zn2FPU23]> { + let Latency = 5; +} +def : InstRW<[Zn2WriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>; + +def Zn2WriteFXCH : SchedWriteRes<[Zn2FPU]>; + +// FXCHG. +def : InstRW<[Zn2WriteFXCH], (instrs XCH_F)>; + +// FILD. +def Zn2WriteFILD : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 11; + let NumMicroOps = 2; +} +def : InstRW<[Zn2WriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. +def Zn2WriteFIST : SchedWriteRes<[Zn2AGU, Zn2FPU23]> { + let Latency = 12; +} +def : InstRW<[Zn2WriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>; + +def Zn2WriteFPU13 : SchedWriteRes<[Zn2AGU, Zn2FPU13]> { + let Latency = 8; +} + +def Zn2WriteFPU3 : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 11; +} + +// FLDZ. +def : SchedAlias<WriteFLD0, Zn2WriteFPU13>; + +// FLD1. +def : SchedAlias<WriteFLD1, Zn2WriteFPU3>; + +// FLDPI FLDL2E etc. +def : SchedAlias<WriteFLDC, Zn2WriteFPU3>; + +// FNSTSW. +// AX. +def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>; + +// m16. +def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>; + +// FLDCW. +def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>; + +// FNSTCW. +def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>; + +// FINCSTP FDECSTP. +def : InstRW<[Zn2WriteFPU3], (instrs FINCSTP, FDECSTP)>; + +// FFREE. +def : InstRW<[Zn2WriteFPU3], (instregex "FFREE")>; + +// FNSAVE. +def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>; + +// FRSTOR. +def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +def Zn2WriteFPU3Lat1 : SchedWriteRes<[Zn2FPU3]> ; + +def Zn2WriteFPU0Lat1 : SchedWriteRes<[Zn2FPU0]> ; + +def Zn2WriteFPU0Lat1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU0]> { + let Latency = 8; +} + +// FCHS. +def : InstRW<[Zn2WriteFPU3Lat1], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. 
+def : InstRW<[Zn2WriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>; +// m. +def : InstRW<[Zn2WriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[Zn2WriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>; + +def Zn2WriteFPU02 : SchedWriteRes<[Zn2AGU, Zn2FPU02]> +{ + let Latency = 9; +} + +// FCOMI(P) FUCOMI(P). +// m. +def : InstRW<[Zn2WriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>; + +def Zn2WriteFPU03 : SchedWriteRes<[Zn2AGU, Zn2FPU03]> +{ + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,3]; +} + +// FICOM(P). +def : InstRW<[Zn2WriteFPU03], (instregex "FICOM(P?)(16|32)m")>; + +// FTST. +def : InstRW<[Zn2WriteFPU0Lat1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[Zn2WriteFPU3Lat1], (instrs FXAM)>; + +// FPREM. +def : InstRW<[WriteMicrocoded], (instrs FPREM)>; + +// FPREM1. +def : InstRW<[WriteMicrocoded], (instrs FPREM1)>; + +// FRNDINT. +def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>; + +// FSCALE. +def : InstRW<[WriteMicrocoded], (instrs FSCALE)>; + +// FXTRACT. +def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>; + +// FNOP. +def : InstRW<[Zn2WriteFPU0Lat1], (instrs FNOP)>; + +// WAIT. +def : InstRW<[Zn2WriteFPU0Lat1], (instrs WAIT)>; + +// FNCLEX. +def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>; + +// FNINIT. +def : InstRW<[WriteMicrocoded], (instrs FNINIT)>; + +//=== Integer MMX and XMM Instructions ===// + +// PACKSSWB/DW. +// mm <- mm. +def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ; +def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> { + let NumMicroOps = 2; +} +def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ; +def Zn2WriteFPU12Ym : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWirr, + MMX_PACKSSWBirr, + MMX_PACKUSWBirr)>; +def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWirm, + MMX_PACKSSWBirm, + MMX_PACKUSWBirm)>; + +// VPMOVSX/ZX BW BD BQ WD WQ DQ. +// y <- x. +def : InstRW<[Zn2WriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>; +def : InstRW<[Zn2WriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>; + +def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ; +def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ; +def Zn2WriteFPU013m : SchedWriteRes<[Zn2AGU, Zn2FPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def Zn2WriteFPU013Ld : SchedWriteRes<[Zn2AGU, Zn2FPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def Zn2WriteFPU013LdY : SchedWriteRes<[Zn2AGU, Zn2FPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} + +// PBLENDW. +// x,x,i / v,v,v,i +def : InstRW<[Zn2WriteFPU013], (instregex "(V?)PBLENDWrri")>; +// ymm +def : InstRW<[Zn2WriteFPU013Y], (instrs VPBLENDWYrri)>; + +// x,m,i / v,v,m,i +def : InstRW<[Zn2WriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>; +// y,m,i +def : InstRW<[Zn2WriteFPU013LdY], (instrs VPBLENDWYrmi)>; + +def Zn2WriteFPU01 : SchedWriteRes<[Zn2FPU01]> ; +def Zn2WriteFPU01Y : SchedWriteRes<[Zn2FPU01]> { + let NumMicroOps = 2; +} + +// VPBLENDD. +// v,v,v,i. 
+def : InstRW<[Zn2WriteFPU01], (instrs VPBLENDDrri)>; +// ymm +def : InstRW<[Zn2WriteFPU01Y], (instrs VPBLENDDYrri)>; + +// v,v,m,i +def Zn2WriteFPU01Op2 : SchedWriteRes<[Zn2AGU, Zn2FPU01]> { + let NumMicroOps = 2; + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def Zn2WriteFPU01Op2Y : SchedWriteRes<[Zn2AGU, Zn2FPU01]> { + let NumMicroOps = 2; + let Latency = 9; + let ResourceCycles = [1, 3]; +} +def : InstRW<[Zn2WriteFPU01Op2], (instrs VPBLENDDrmi)>; +def : InstRW<[Zn2WriteFPU01Op2Y], (instrs VPBLENDDYrmi)>; + +// MASKMOVQ. +def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOVD. +// ymm +def : InstRW<[WriteMicrocoded], + (instregex "VPMASKMOVD(Y?)rm")>; +// m, v,v. +def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. +def Zn2WriteVPBROADCAST128Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[Zn2WriteVPBROADCAST128Ld], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def Zn2WriteVPBROADCAST256Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[Zn2WriteVPBROADCAST256Ld], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHER. +def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +// PHADD|PHSUB (S) W/D. +def : SchedAlias<WritePHAdd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePHAddLd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePHAddX, Zn2WriteMicrocoded>; +def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>; +def : SchedAlias<WritePHAddY, Zn2WriteMicrocoded>; +def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>; + +// PCMPGTQ. +def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>; +def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// x <- x,m. +def Zn2WritePCMPGTQm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> { + let Latency = 8; +} +// ymm. +def Zn2WritePCMPGTQYm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> { + let Latency = 8; +} +def : InstRW<[Zn2WritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>; +def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>; + +//-- Logic instructions --// + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> ; +def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ; + +// PSLL,PSRL DQ. +def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>; +def : InstRW<[Zn2WritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// VPERM2F128. +def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>; +def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>; + +def Zn2WriteBROADCAST : SchedWriteRes<[Zn2AGU, Zn2FPU13]> { + let NumMicroOps = 2; + let Latency = 8; +} +// VBROADCASTF128. +def : InstRW<[Zn2WriteBROADCAST], (instrs VBROADCASTF128)>; + +// EXTRACTPS. +// r32,x,i. +def Zn2WriteEXTRACTPSr : SchedWriteRes<[Zn2FPU12, Zn2FPU2]> { + let Latency = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[Zn2WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +def Zn2WriteEXTRACTPSm : SchedWriteRes<[Zn2AGU,Zn2FPU12, Zn2FPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [5, 1, 2]; +} +// m32,x,i. +def : InstRW<[Zn2WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. +// x,y,i. 
+def : InstRW<[Zn2WriteFPU013], (instrs VEXTRACTF128rr)>; + +// m128,y,i. +def : InstRW<[Zn2WriteFPU013m], (instrs VEXTRACTF128mr)>; + +def Zn2WriteVINSERT128r: SchedWriteRes<[Zn2FPU013]> { + let Latency = 2; +// let ResourceCycles = [2]; +} +def Zn2WriteVINSERT128Ld: SchedWriteRes<[Zn2AGU,Zn2FPU013]> { + let Latency = 9; + let NumMicroOps = 2; +} +// VINSERTF128. +// y,y,x,i. +def : InstRW<[Zn2WriteVINSERT128r], (instrs VINSERTF128rr)>; +def : InstRW<[Zn2WriteVINSERT128Ld], (instrs VINSERTF128rm)>; + +// VGATHER. +def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>; + +//-- Conversion instructions --// +def Zn2WriteCVTPD2PSr: SchedWriteRes<[Zn2FPU3]> { + let Latency = 3; +} +def Zn2WriteCVTPD2PSYr: SchedWriteRes<[Zn2FPU3]> { + let Latency = 3; +} + +// CVTPD2PS. +// x,x. +def : SchedAlias<WriteCvtPD2PS, Zn2WriteCVTPD2PSr>; +// y,y. +def : SchedAlias<WriteCvtPD2PSY, Zn2WriteCVTPD2PSYr>; +// z,z. +defm : X86WriteResUnsupported<WriteCvtPD2PSZ>; + +def Zn2WriteCVTPD2PSLd: SchedWriteRes<[Zn2AGU,Zn2FPU03]> { + let Latency = 10; + let NumMicroOps = 2; +} +// x,m128. +def : SchedAlias<WriteCvtPD2PSLd, Zn2WriteCVTPD2PSLd>; + +// x,m256. +def Zn2WriteCVTPD2PSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 10; +} +def : SchedAlias<WriteCvtPD2PSYLd, Zn2WriteCVTPD2PSYLd>; +// z,m512 +defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>; + +// CVTSD2SS. +// x,x. +// Same as WriteCVTPD2PSr +def : SchedAlias<WriteCvtSD2SS, Zn2WriteCVTPD2PSr>; + +// x,m64. +def : SchedAlias<WriteCvtSD2SSLd, Zn2WriteCVTPD2PSLd>; + +// CVTPS2PD. +// x,x. +def Zn2WriteCVTPS2PDr : SchedWriteRes<[Zn2FPU3]> { + let Latency = 3; +} +def : SchedAlias<WriteCvtPS2PD, Zn2WriteCVTPS2PDr>; + +// x,m64. +// y,m128. +def Zn2WriteCVTPS2PDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 10; + let NumMicroOps = 2; +} +def : SchedAlias<WriteCvtPS2PDLd, Zn2WriteCVTPS2PDLd>; +def : SchedAlias<WriteCvtPS2PDYLd, Zn2WriteCVTPS2PDLd>; +defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>; + +// y,x. +def Zn2WriteVCVTPS2PDY : SchedWriteRes<[Zn2FPU3]> { + let Latency = 3; +} +def : SchedAlias<WriteCvtPS2PDY, Zn2WriteVCVTPS2PDY>; +defm : X86WriteResUnsupported<WriteCvtPS2PDZ>; + +// CVTSS2SD. +// x,x. +def Zn2WriteCVTSS2SDr : SchedWriteRes<[Zn2FPU3]> { + let Latency = 3; +} +def : SchedAlias<WriteCvtSS2SD, Zn2WriteCVTSS2SDr>; + +// x,m32. +def Zn2WriteCVTSS2SDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : SchedAlias<WriteCvtSS2SDLd, Zn2WriteCVTSS2SDLd>; + +def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> { + let Latency = 3; +} +// CVTDQ2PD. +// x,x. +def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>; + +// Same as xmm +// y,x. +def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>; +def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>; + +def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> { + let Latency = 3; +} +// CVT(T)PD2DQ. +// x,x. +def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>; + +def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> { + let Latency = 10; + let NumMicroOps = 2; +} +// x,m128. +def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>; +// same as xmm handling +// x,y. +def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>; +// x,m256. +def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>; + +def Zn2WriteCVTPS2PIr: SchedWriteRes<[Zn2FPU3]> { + let Latency = 4; +} +// CVT(T)PS2PI. +// mm,x. 
+def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; + +// CVTPI2PD. +// x,mm. +def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; + +// CVT(T)PD2PI. +// mm,x. +def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; + +def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> { + let Latency = 4; +} + +// same as CVTPD2DQr +// CVT(T)SS2SI. +// r32,x. +def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>; +// same as CVTPD2DQm +// r32,m32. +def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>; + +def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> { + let Latency = 4; +} +// CVTSI2SD. +// x,r32/64. +def : InstRW<[Zn2WriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>; + + +def Zn2WriteCVSTSI2SIr: SchedWriteRes<[Zn2FPU3, Zn2FPU2]> { + let Latency = 4; +} +def Zn2WriteCVSTSI2SILd: SchedWriteRes<[Zn2AGU, Zn2FPU3, Zn2FPU2]> { + let Latency = 11; +} +// CVTSD2SI. +// r32/64 +def : InstRW<[Zn2WriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>; +// r32,m32. +def : InstRW<[Zn2WriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>; + +// VCVTPS2PH. +// x,v,i. +def : SchedAlias<WriteCvtPS2PH, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCvtPS2PHY, Zn2WriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +// m,v,i. +def : SchedAlias<WriteCvtPS2PHSt, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCvtPS2PHYSt, Zn2WriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +// VCVTPH2PS. +// v,x. +def : SchedAlias<WriteCvtPH2PS, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCvtPH2PSY, Zn2WriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZ>; +// v,m. +def : SchedAlias<WriteCvtPH2PSLd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteCvtPH2PSYLd, Zn2WriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>; + +//-- SSE4A instructions --// +// EXTRQ +def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> { + let Latency = 2; +} +def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>; + +// INSERTQ +def Zn2WriteINSERTQ: SchedWriteRes<[Zn2FPU03,Zn2FPU1]> { + let Latency = 4; +} +def : InstRW<[Zn2WriteINSERTQ], (instregex "INSERTQ")>; + +//-- SHA instructions --// +// SHA256MSG2 +def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>; + +// SHA1MSG1, SHA256MSG1 +// x,x. +def Zn2WriteSHA1MSG1r : SchedWriteRes<[Zn2FPU12]> { + let Latency = 2; +} +def : InstRW<[Zn2WriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>; +// x,m. +def Zn2WriteSHA1MSG1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { + let Latency = 9; +} +def : InstRW<[Zn2WriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>; + +// SHA1MSG2 +// x,x. +def Zn2WriteSHA1MSG2r : SchedWriteRes<[Zn2FPU12]> ; +def : InstRW<[Zn2WriteSHA1MSG2r], (instregex "SHA1MSG2rr")>; +// x,m. +def Zn2WriteSHA1MSG2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { + let Latency = 8; +} +def : InstRW<[Zn2WriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>; + +// SHA1NEXTE +// x,x. +def Zn2WriteSHA1NEXTEr : SchedWriteRes<[Zn2FPU1]> ; +def : InstRW<[Zn2WriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>; +// x,m. +def Zn2WriteSHA1NEXTELd : SchedWriteRes<[Zn2AGU, Zn2FPU1]> { + let Latency = 8; +} +def : InstRW<[Zn2WriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>; + +// SHA1RNDS4 +// x,x. +def Zn2WriteSHA1RNDS4r : SchedWriteRes<[Zn2FPU1]> { + let Latency = 6; +} +def : InstRW<[Zn2WriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>; +// x,m. 
+def Zn2WriteSHA1RNDS4Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> { + let Latency = 13; +} +def : InstRW<[Zn2WriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>; + +// SHA256RNDS2 +// x,x. +def Zn2WriteSHA256RNDS2r : SchedWriteRes<[Zn2FPU1]> { + let Latency = 4; +} +def : InstRW<[Zn2WriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>; +// x,m. +def Zn2WriteSHA256RNDS2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> { + let Latency = 11; +} +def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +def : SchedAlias<WriteFHAdd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteFHAddLd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteFHAddY, Zn2WriteMicrocoded>; +def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>; + +// VDIVPS. +// TODO - convert to Zn2WriteResFpuPair +// y,y,y. +def Zn2WriteVDIVPSYr : SchedWriteRes<[Zn2FPU3]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : SchedAlias<WriteFDivY, Zn2WriteVDIVPSYr>; + +// y,y,m256. +def Zn2WriteVDIVPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 17; + let NumMicroOps = 2; + let ResourceCycles = [1, 17]; +} +def : SchedAlias<WriteFDivYLd, Zn2WriteVDIVPSYLd>; + +// VDIVPD. +// TODO - convert to Zn2WriteResFpuPair +// y,y,y. +def Zn2WriteVDIVPDY : SchedWriteRes<[Zn2FPU3]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : SchedAlias<WriteFDiv64Y, Zn2WriteVDIVPDY>; + +// y,y,m256. +def Zn2WriteVDIVPDYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,20]; +} +def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>; + +// DPPS. +// x,x,i / v,v,v,i. +def : SchedAlias<WriteDPPS, Zn2WriteMicrocoded>; +def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>; + +// x,m,i / v,v,m,i. +def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>; +def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>; + +// DPPD. +// x,x,i. +def : SchedAlias<WriteDPPD, Zn2WriteMicrocoded>; + +// x,m,i. +def : SchedAlias<WriteDPPDLd, Zn2WriteMicrocoded>; + +// RSQRTSS +// TODO - convert to Zn2WriteResFpuPair +// x,x. +def Zn2WriteRSQRTSSr : SchedWriteRes<[Zn2FPU02]> { + let Latency = 5; +} +def : SchedAlias<WriteFRsqrt, Zn2WriteRSQRTSSr>; + +// x,m128. +def Zn2WriteRSQRTSSLd: SchedWriteRes<[Zn2AGU, Zn2FPU02]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +def : SchedAlias<WriteFRsqrtLd, Zn2WriteRSQRTSSLd>; + +// RSQRTPS +// TODO - convert to Zn2WriteResFpuPair +// y,y. +def Zn2WriteRSQRTPSYr : SchedWriteRes<[Zn2FPU01]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : SchedAlias<WriteFRsqrtY, Zn2WriteRSQRTPSYr>; + +// y,m256. +def Zn2WriteRSQRTPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU01]> { + let Latency = 12; + let NumMicroOps = 2; +} +def : SchedAlias<WriteFRsqrtYLd, Zn2WriteRSQRTPSYLd>; + +//-- Other instructions --// + +// VZEROUPPER. +def : InstRW<[WriteALU], (instrs VZEROUPPER)>; + +// VZEROALL. +def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>; + +} // SchedModel diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index b8980789258e..9aa47c532e82 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -148,8 +148,8 @@ private: /// Manages the predicate state traced through the program. 
struct PredState { - unsigned InitialReg; - unsigned PoisonReg; + unsigned InitialReg = 0; + unsigned PoisonReg = 0; const TargetRegisterClass *RC; MachineSSAUpdater SSA; @@ -158,10 +158,10 @@ private: : RC(RC), SSA(MF) {} }; - const X86Subtarget *Subtarget; - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; - const TargetRegisterInfo *TRI; + const X86Subtarget *Subtarget = nullptr; + MachineRegisterInfo *MRI = nullptr; + const X86InstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; Optional<PredState> PS; diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index f8f78da52cc2..75c3a70b430a 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -324,8 +324,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) - : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM), - TargetTriple(TT), StackAlignOverride(StackAlignOverride), + : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::Style::None), + TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), @@ -337,15 +337,15 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (!isPositionIndependent()) - setPICStyle(PICStyles::None); + setPICStyle(PICStyles::Style::None); else if (is64Bit()) - setPICStyle(PICStyles::RIPRel); + setPICStyle(PICStyles::Style::RIPRel); else if (isTargetCOFF()) - setPICStyle(PICStyles::None); + setPICStyle(PICStyles::Style::None); else if (isTargetDarwin()) - setPICStyle(PICStyles::StubPIC); + setPICStyle(PICStyles::Style::StubPIC); else if (isTargetELF()) - setPICStyle(PICStyles::GOT); + setPICStyle(PICStyles::Style::GOT); CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); Legalizer.reset(new X86LegalizerInfo(*this, TM)); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index e8efe8f2afe5..f4e8d30328ca 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -40,7 +40,7 @@ class GlobalValue; /// namespace PICStyles { -enum Style { +enum class Style { StubPIC, // Used on i386-darwin in pic mode. GOT, // Used on 32 bit elf on when in pic mode. RIPRel, // Used on X86-64 when in pic mode. @@ -56,10 +56,7 @@ public: enum X86ProcFamilyEnum { Others, IntelAtom, - IntelSLM, - IntelGLM, - IntelGLP, - IntelTRM + IntelSLM }; protected: @@ -256,9 +253,9 @@ protected: /// mask over multiple fixed shuffles. bool HasFastVariableShuffle = false; - /// True if there is no performance penalty to writing only the lower parts - /// of a YMM or ZMM register without clearing the upper part. - bool HasFastPartialYMMorZMMWrite = false; + /// True if vzeroupper instructions should be inserted after code that uses + /// ymm or zmm registers. + bool InsertVZEROUPPER = false; /// True if there is no performance penalty for writing NOPs with up to /// 11 bytes. @@ -445,9 +442,15 @@ protected: /// Indicates target prefers 256 bit instructions. bool Prefer256Bit = false; + /// Indicates target prefers AVX512 mask registers. + bool PreferMaskRegisters = false; + /// Threeway branch is profitable in this subtarget. 
bool ThreewayBranchProfitable = false; + /// Use Goldmont specific floating point div/sqrt costs. + bool UseGLMDivSqrtCosts = false; + /// What processor and OS we're targeting. Triple TargetTriple; @@ -655,9 +658,7 @@ public: bool hasFastVariableShuffle() const { return HasFastVariableShuffle; } - bool hasFastPartialYMMorZMMWrite() const { - return HasFastPartialYMMorZMMWrite; - } + bool insertVZEROUPPER() const { return InsertVZEROUPPER; } bool hasFastGather() const { return HasFastGather; } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } @@ -706,6 +707,8 @@ public: return UseRetpolineIndirectBranches; } bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } + bool preferMaskRegisters() const { return PreferMaskRegisters; } + bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -738,11 +741,6 @@ public: /// TODO: to be removed later and replaced with suitable properties bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } - bool isGLM() const { - return X86ProcFamily == IntelGLM || - X86ProcFamily == IntelGLP || - X86ProcFamily == IntelTRM; - } bool useSoftFloat() const { return UseSoftFloat; } bool useAA() const override { return UseAA; } @@ -801,11 +799,11 @@ public: bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } - bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } - bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; } + bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; } bool isPICStyleStubPIC() const { - return PICStyle == PICStyles::StubPIC; + return PICStyle == PICStyles::Style::StubPIC; } bool isPositionIndependent() const { return TM.isPositionIndependent(); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index c15297134e4d..8c696e9adbed 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/CFGuard.h" #include <memory> #include <string> @@ -60,7 +61,7 @@ static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding", "folding pass"), cl::init(false), cl::Hidden); -extern "C" void LLVMInitializeX86Target() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target()); RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target()); @@ -229,9 +230,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } - // Outlining is available for x86-64. - if (TT.getArch() == Triple::x86_64) - setMachineOutliner(true); + setMachineOutliner(true); initAsmInfo(); } @@ -414,6 +413,16 @@ void X86PassConfig::addIRPasses() { // thunk. These will be a no-op unless a function subtarget has the retpoline // feature enabled. addPass(createIndirectBrExpandPass()); + + // Add Control Flow Guard checks. 
+ const Triple &TT = TM->getTargetTriple(); + if (TT.isOSWindows()) { + if (TT.getArch() == Triple::x86_64) { + addPass(createCFGuardDispatchPass()); + } else { + addPass(createCFGuardCheckPass()); + } + } } bool X86PassConfig::addInstSelector() { @@ -530,6 +539,9 @@ void X86PassConfig::addPreEmitPass2() { (!TT.isOSWindows() || MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) addPass(createCFIInstrInserter()); + // Identify valid longjmp targets for Windows Control Flow Guard. + if (TT.isOSWindows()) + addPass(createCFGuardLongjmpPass()); } std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 70fd857fcf01..b754836ea517 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -169,12 +169,13 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, - TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, - TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, - ArrayRef<const Value *> Args) { +int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef<const Value *> Args, + const Instruction *CxtI) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -188,7 +189,7 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::FDIV, MVT::v2f64, 65 }, // divpd }; - if (ST->isGLM()) + if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) return LT.first * Entry->Cost; @@ -280,7 +281,7 @@ int X86TTIImpl::getArithmeticInstrCost( TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - if (ISD == ISD::UREM) + else // UREM return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -1389,6 +1390,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, @@ -1397,6 +1399,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, @@ -1550,6 +1553,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB + { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, }; @@ -1576,9 +1580,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, 
MVT::v2i32, MVT::v2f64, 3 }, + { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, @@ -2199,7 +2208,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, MVT MTy = LT.second; // Attempt to lookup cost. - if (ST->isGLM()) + if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) return LT.first * Entry->Cost; @@ -2374,6 +2383,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, } int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + static const CostTblEntry SLMCostTbl[] = { + { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } + }; + assert(Val->isVectorTy() && "This must be a vector type"); Type *ScalarType = Val->getScalarType(); @@ -2390,9 +2406,22 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { unsigned Width = LT.second.getVectorNumElements(); Index = Index % Width; - // Floating point scalars are already located in index #0. - if (ScalarType->isFloatingPointTy() && Index == 0) - return 0; + if (Index == 0) { + // Floating point scalars are already located in index #0. + if (ScalarType->isFloatingPointTy()) + return 0; + + // Assume movd/movq XMM <-> GPR is relatively cheap on all targets. + if (ScalarType->isIntegerTy()) + return 1; + } + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Unexpected vector opcode"); + MVT MScalarTy = LT.second.getScalarType(); + if (ST->isSLM()) + if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) + return LT.first * Entry->Cost; } // Add to the base cost if we know that the extracted element of a vector is @@ -2404,8 +2433,9 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } -int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace, const Instruction *I) { +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + MaybeAlign Alignment, unsigned AddressSpace, + const Instruction *I) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -2456,7 +2486,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask - return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace); + return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace); unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = @@ -2474,7 +2504,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore); int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - Alignment, AddressSpace); + MaybeAlign(Alignment), AddressSpace); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } @@ -2533,6 +2563,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. 
+ static const CostTblEntry SLMCostTblPairWise[] = { + { ISD::FADD, MVT::v2f64, 3 }, + { ISD::ADD, MVT::v2i64, 5 }, + }; + static const CostTblEntry SSE2CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, @@ -2558,6 +2593,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v32i8, 4 }, }; + static const CostTblEntry SLMCostTblNoPairWise[] = { + { ISD::FADD, MVT::v2f64, 3 }, + { ISD::ADD, MVT::v2i64, 5 }, + }; + static const CostTblEntry SSE2CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, @@ -2594,6 +2634,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, if (VT.isSimple()) { MVT MTy = VT.getSimpleVT(); if (IsPairwise) { + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) + return Entry->Cost; + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) return Entry->Cost; @@ -2602,6 +2646,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) return Entry->Cost; } else { + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return Entry->Cost; + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) return Entry->Cost; @@ -2617,6 +2665,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, MVT MTy = LT.second; if (IsPairwise) { + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; @@ -2625,6 +2677,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; } else { + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; @@ -2634,6 +2690,24 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, return LT.first * Entry->Cost; } + // FIXME: These assume a naive kshift+binop lowering, which is probably + // conservative in most cases. + // FIXME: This doesn't cost large types like v128i1 correctly. + static const CostTblEntry AVX512BoolReduction[] = { + { ISD::AND, MVT::v2i1, 3 }, + { ISD::AND, MVT::v4i1, 5 }, + { ISD::AND, MVT::v8i1, 7 }, + { ISD::AND, MVT::v16i1, 9 }, + { ISD::AND, MVT::v32i1, 11 }, + { ISD::AND, MVT::v64i1, 13 }, + { ISD::OR, MVT::v2i1, 3 }, + { ISD::OR, MVT::v4i1, 5 }, + { ISD::OR, MVT::v8i1, 7 }, + { ISD::OR, MVT::v16i1, 9 }, + { ISD::OR, MVT::v32i1, 11 }, + { ISD::OR, MVT::v64i1, 13 }, + }; + static const CostTblEntry AVX2BoolReduction[] = { { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp @@ -2664,7 +2738,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, }; // Handle bool allof/anyof patterns. 
- if (ValTy->getVectorElementType()->isIntegerTy(1)) { + if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) + return LT.first * Entry->Cost; if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) return LT.first * Entry->Cost; @@ -2956,7 +3033,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return std::max(1, Cost); } -int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, +int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); @@ -3053,8 +3130,8 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, return X86TTIImpl::getIntImmCost(Imm, Ty); } -int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) { +int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -3164,7 +3241,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, ? ST->getGatherOverhead() : ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - Alignment, AddressSpace); + MaybeAlign(Alignment), AddressSpace); } /// Return the cost of full scalarization of gather / scatter operation. @@ -3194,7 +3271,7 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, // The cost of the scalar loads/stores. int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - Alignment, AddressSpace); + MaybeAlign(Alignment), AddressSpace); int InsertExtractCost = 0; if (Opcode == Instruction::Load) @@ -3224,8 +3301,10 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, unsigned AddressSpace = PtrTy->getAddressSpace(); bool Scalarize = false; - if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || - (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + if ((Opcode == Instruction::Load && + !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) || + (Opcode == Instruction::Store && + !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment)))) Scalarize = true; // Gather / Scatter for vector 2 is not profitable on KNL / SKX // Vector-4 of gather/scatter instruction does not exist on KNL. @@ -3348,7 +3427,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { return isLegalMaskedExpandLoad(DataTy); } -bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) { // Some CPUs have better gather performance than others. // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only // enable gather with a -march. @@ -3386,11 +3465,11 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { return IntWidth == 32 || IntWidth == 64; } -bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { // AVX2 doesn't support scatter if (!ST->hasAVX512()) return false; - return isLegalMaskedGather(DataType); + return isLegalMaskedGather(DataType, Alignment); } bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { @@ -3443,10 +3522,9 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { // version is not as fast for three way compare (see #33329). 
const unsigned PreferredWidth = ST->getPreferVectorWidth(); if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); - if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); + if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); - // All GPR and vector loads can be unaligned. SIMD compare requires integer - // vectors (SSE2/AVX2). + // All GPR and vector loads can be unaligned. Options.AllowOverlappingLoads = true; } if (ST->is64Bit()) { @@ -3520,8 +3598,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, // Get the cost of one memory operation. Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), LegalVT.getVectorNumElements()); - unsigned MemOpCost = - getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, + MaybeAlign(Alignment), AddressSpace); VectorType *VT = VectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); @@ -3620,8 +3698,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, // Get the cost of one memory operation. Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), LegalVT.getVectorNumElements()); - unsigned MemOpCost = - getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, + MaybeAlign(Alignment), AddressSpace); unsigned VF = VecTy->getVectorNumElements() / Factor; MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 7581257f41f8..b9c2dbd78058 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -51,7 +51,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { X86::FeatureFastBEXTR, X86::FeatureFastHorizontalOps, X86::FeatureFastLZCNT, - X86::FeatureFastPartialYMMorZMMWrite, X86::FeatureFastScalarFSQRT, X86::FeatureFastSHLDRotate, X86::FeatureFastScalarShiftMasks, @@ -77,6 +76,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { X86::FeatureSlowSHLD, X86::FeatureSlowTwoMemOps, X86::FeatureSlowUAMem16, + X86::FeaturePreferMaskRegisters, + X86::FeatureInsertVZEROUPPER, + X86::FeatureUseGLMDivSqrtCosts, // Perf-tuning flags. X86::FeatureHasFastGather, @@ -88,10 +90,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { // CPU name enums. These just follow CPU string. 
X86::ProcIntelAtom, - X86::ProcIntelGLM, - X86::ProcIntelGLP, X86::ProcIntelSLM, - X86::ProcIntelTRM, }; public: @@ -126,14 +125,15 @@ public: TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef<const Value *> Args = ArrayRef<const Value *>()); + ArrayRef<const Value *> Args = ArrayRef<const Value *>(), + const Instruction *CxtI = nullptr); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); @@ -179,9 +179,9 @@ public: unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); - int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); - int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); @@ -189,8 +189,8 @@ public: bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); - bool isLegalMaskedGather(Type *DataType); - bool isLegalMaskedScatter(Type *DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment); bool isLegalMaskedExpandLoad(Type *DataType); bool isLegalMaskedCompressStore(Type *DataType); bool hasDivRemOp(Type *DataType, bool IsSigned); diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index 9280d030b5d5..7a8308ef1ba9 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -279,7 +279,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { /// function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite()) + if (!ST.hasAVX() || !ST.insertVZEROUPPER()) return false; TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp index ae72c6427588..42e8fba2201e 100644 --- a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp @@ -54,14 +54,14 @@ private: /// Lower a WinAlloca instruction. 
void lower(MachineInstr* MI, Lowering L); - MachineRegisterInfo *MRI; - const X86Subtarget *STI; - const TargetInstrInfo *TII; - const X86RegisterInfo *TRI; - unsigned StackPtr; - unsigned SlotSize; - int64_t StackProbeSize; - bool NoStackArgProbe; + MachineRegisterInfo *MRI = nullptr; + const X86Subtarget *STI = nullptr; + const TargetInstrInfo *TII = nullptr; + const X86RegisterInfo *TRI = nullptr; + unsigned StackPtr = 0; + unsigned SlotSize = 0; + int64_t StackProbeSize = 0; + bool NoStackArgProbe = false; StringRef getPassName() const override { return "X86 WinAlloca Expander"; } static char ID; diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index d65e1f3ab414..78d3f6460189 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -23,7 +23,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -91,7 +92,7 @@ private: EHPersonality Personality = EHPersonality::Unknown; Function *PersonalityFn = nullptr; bool UseStackGuard = false; - int ParentBaseState; + int ParentBaseState = 0; FunctionCallee SehLongjmpUnwind = nullptr; Constant *Cookie = nullptr; @@ -178,11 +179,6 @@ bool WinEHStatePass::runOnFunction(Function &F) { {Int8PtrType, Type::getInt32Ty(TheModule->getContext())}, /*isVarArg=*/true)); - // Disable frame pointer elimination in this function. - // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we - // use an arbitrary register? - F.addFnAttr("no-frame-pointer-elim", "true"); - emitExceptionRegistrationRecord(&F); // The state numbers calculated here in IR must agree with what we calculate |
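
Most of the cost-model changes above follow one pattern: consult a CPU-specific static cost table (the new SLM, GLM and AVX512 tables) via CostTableLookup, and fall back to the next table or the base implementation when no entry matches. The snippet below is a minimal standalone sketch of that lookup pattern, written in plain C++ rather than against LLVM's headers; the enum names, the fallback cost, and everything except the two SLM reduction entries copied from the diff are placeholders for illustration, not code from this commit or from LLVM's CostTable.h.

// Minimal standalone model of the cost-table pattern used in the diff.
// The real helpers (CostTblEntry, CostTableLookup) live in
// llvm/CodeGen/CostTable.h; the enums and fallback cost here are stand-ins.
#include <cstddef>
#include <cstdio>
#include <optional>

enum SimpleISD { ISD_ADD, ISD_FADD };               // stand-in for llvm::ISD opcodes
enum SimpleVT  { MVT_v2i64, MVT_v2f64, MVT_v4f32 }; // stand-in for llvm::MVT types

struct CostEntry {
  SimpleISD Opcode;
  SimpleVT  Ty;
  int       Cost;
};

// Mirrors CostTableLookup: linear scan for a matching (opcode, type) pair.
template <std::size_t N>
std::optional<int> lookupCost(const CostEntry (&Tbl)[N], SimpleISD Op,
                              SimpleVT Ty) {
  for (const CostEntry &E : Tbl)
    if (E.Opcode == Op && E.Ty == Ty)
      return E.Cost;
  return std::nullopt; // no entry: caller falls through to the next table
}

// Shaped like SLMCostTblNoPairWise in the diff; the two costs match it.
static const CostEntry SLMReductionTbl[] = {
    {ISD_FADD, MVT_v2f64, 3},
    {ISD_ADD,  MVT_v2i64, 5},
};

int getReductionCost(bool IsSLM, SimpleISD Op, SimpleVT Ty) {
  if (IsSLM)
    if (auto C = lookupCost(SLMReductionTbl, Op, Ty))
      return *C;   // CPU-specific override wins
  return 2;        // placeholder for the generic SSE2/AVX fallback cost
}

int main() {
  std::printf("v2i64 add reduction on SLM:   %d\n",
              getReductionCost(/*IsSLM=*/true, ISD_ADD, MVT_v2i64));
  std::printf("v2i64 add reduction elsewhere: %d\n",
              getReductionCost(/*IsSLM=*/false, ISD_ADD, MVT_v2i64));
}

In the patch itself the same shape appears as "if (ST->isSLM()) if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost;", with the type-legalization factor LT.first scaling the per-operation table cost, and with useGLMDivSqrtCosts() and hasAVX512() gating the other new tables in the same way.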