author | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-04-16 16:01:22 +0000
commit | 71d5a2540a98c81f5bcaeb48805e0e2881f530ef (patch) |
tree | 5343938942df402b49ec7300a1c25a2d4ccd5821 /lib/Target/X86 |
parent | 31bbf64f3a4974a2d6c8b3b27ad2f519caf74057 (diff) |
Diffstat (limited to 'lib/Target/X86')
85 files changed, 8510 insertions, 5414 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index c38a7d1dd44df..788fac62626b7 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -1,4 +1,4 @@ -//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===// +//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===// // // The LLVM Compiler Infrastructure // @@ -7,24 +7,31 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86AsmInstrumentation.h" -#include "MCTargetDesc/X86BaseInfo.h" #include "X86Operand.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" #include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SMLoc.h" #include <algorithm> #include <cassert> +#include <cstdint> +#include <limits> +#include <memory> #include <vector> // Following comment describes how assembly instrumentation works. @@ -91,30 +98,35 @@ // register as a frame register and temprorary override current CFA // register. -namespace llvm { -namespace { +using namespace llvm; static cl::opt<bool> ClAsanInstrumentAssembly( "asan-instrument-assembly", cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, cl::init(false)); -const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min(); -const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max(); +static const int64_t MinAllowedDisplacement = + std::numeric_limits<int32_t>::min(); +static const int64_t MaxAllowedDisplacement = + std::numeric_limits<int32_t>::max(); -int64_t ApplyDisplacementBounds(int64_t Displacement) { +static int64_t ApplyDisplacementBounds(int64_t Displacement) { return std::max(std::min(MaxAllowedDisplacement, Displacement), MinAllowedDisplacement); } -void CheckDisplacementBounds(int64_t Displacement) { +static void CheckDisplacementBounds(int64_t Displacement) { assert(Displacement >= MinAllowedDisplacement && Displacement <= MaxAllowedDisplacement); } -bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } +static bool IsStackReg(unsigned Reg) { + return Reg == X86::RSP || Reg == X86::ESP; +} -bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } +static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } + +namespace { class X86AddressSanitizer : public X86AsmInstrumentation { public: @@ -178,7 +190,7 @@ public: X86AddressSanitizer(const MCSubtargetInfo *&STI) : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - ~X86AddressSanitizer() override {} + ~X86AddressSanitizer() override = default; // X86AsmInstrumentation implementation: void InstrumentAndEmitInstruction(const MCInst &Inst, @@ -255,9 +267,11 @@ protected: bool is64BitMode() const { return STI->getFeatureBits()[X86::Mode64Bit]; } + bool is32BitMode() const { return STI->getFeatureBits()[X86::Mode32Bit]; } + 
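Editorial note: the X86AsmInstrumentation hunk above clamps memory-operand displacements into the signed 32-bit range via ApplyDisplacementBounds/CheckDisplacementBounds. A minimal standalone sketch of that clamping logic follows; it is plain C++ written for illustration, not the LLVM code itself.

```cpp
// Sketch of the displacement clamping performed by ApplyDisplacementBounds
// in the hunk above. Plain C++, illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <limits>

static const int64_t MinDisp = std::numeric_limits<int32_t>::min();
static const int64_t MaxDisp = std::numeric_limits<int32_t>::max();

// Clamp an arbitrary 64-bit displacement into the signed 32-bit range
// that an x86 memory operand can encode.
static int64_t applyDisplacementBounds(int64_t Disp) {
  return std::max(std::min(MaxDisp, Disp), MinDisp);
}

int main() {
  int64_t Huge = int64_t(1) << 40;
  int64_t Clamped = applyDisplacementBounds(Huge);
  assert(Clamped == MaxDisp);
  std::cout << Clamped << "\n"; // 2147483647
}
```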
bool is16BitMode() const { return STI->getFeatureBits()[X86::Mode16Bit]; } @@ -498,7 +512,7 @@ public: X86AddressSanitizer32(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - ~X86AddressSanitizer32() override {} + ~X86AddressSanitizer32() override = default; unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); @@ -604,9 +618,9 @@ private: EmitInstruction( Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + (IsWrite ? "store" : "load") + - llvm::Twine(AccessSize)); + Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -756,7 +770,7 @@ public: X86AddressSanitizer64(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - ~X86AddressSanitizer64() override {} + ~X86AddressSanitizer64() override = default; unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); @@ -875,15 +889,17 @@ private: EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( RegCtx.AddressReg(64))); } - MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") + (IsWrite ? "store" : "load") + - llvm::Twine(AccessSize)); + Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); } }; +} // end anonymous namespace + void X86AddressSanitizer64::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { @@ -1022,12 +1038,10 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, RestoreFlags(Out); } -} // End anonymous namespace - X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) - : STI(STI), InitialFrameReg(0) {} + : STI(STI) {} -X86AsmInstrumentation::~X86AsmInstrumentation() {} +X86AsmInstrumentation::~X86AsmInstrumentation() = default; void X86AsmInstrumentation::InstrumentAndEmitInstruction( const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, @@ -1060,8 +1074,9 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, } X86AsmInstrumentation * -CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo *&STI) { +llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, + const MCContext &Ctx, + const MCSubtargetInfo *&STI) { Triple T(STI->getTargetTriple()); const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && @@ -1073,5 +1088,3 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, } return new X86AsmInstrumentation(STI); } - -} // end llvm namespace diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 470ceadb0aa6b..97a55cd8ad983 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -1,4 +1,4 @@ -//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===// +//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===// // // The LLVM Compiler 
Infrastructure // @@ -11,7 +11,6 @@ #define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H #include "llvm/ADT/SmallVector.h" - #include <memory> namespace llvm { @@ -23,7 +22,6 @@ class MCParsedAsmOperand; class MCStreamer; class MCSubtargetInfo; class MCTargetOptions; - class X86AsmInstrumentation; X86AsmInstrumentation * @@ -43,7 +41,7 @@ public: // Tries to instrument and emit instruction. virtual void InstrumentAndEmitInstruction( const MCInst &Inst, - SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: @@ -60,9 +58,9 @@ protected: const MCSubtargetInfo *&STI; - unsigned InitialFrameReg; + unsigned InitialFrameReg = 0; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index e692118f47fdc..324da650e74e7 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -98,6 +98,14 @@ private: IC_REGISTER }; + enum IntelOperatorKind { + IOK_INVALID = 0, + IOK_LENGTH, + IOK_SIZE, + IOK_TYPE, + IOK_OFFSET + }; + class InfixCalculator { typedef std::pair< InfixCalculatorTok, int64_t > ICToken; SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; @@ -704,10 +712,12 @@ private: std::unique_ptr<X86Operand> ParseIntelOperand(); std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); - std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind); + unsigned IdentifyIntelOperator(StringRef Name); + unsigned ParseIntelOperator(unsigned OpKind); std::unique_ptr<X86Operand> ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); + bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); std::unique_ptr<X86Operand> ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, @@ -814,6 +824,7 @@ private: /// } public: + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr), @@ -1266,10 +1277,12 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, } } // Remove all the ImmPrefix rewrites within the brackets. 
+ // We may have some Imm rewrties as a result of an operator applying, + // remove them as well for (AsmRewrite &AR : AsmRewrites) { if (AR.Loc.getPointer() < StartInBrac.getPointer()) continue; - if (AR.Kind == AOK_ImmPrefix) + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) AR.Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); @@ -1286,6 +1299,30 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, } } +// Some binary bitwise operators have a named synonymous +// Query a candidate string for being such a named operator +// and if so - invoke the appropriate handler +bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM) { + // A named operator should be either lower or upper case, but not a mix + if (Name.compare(Name.lower()) && Name.compare(Name.upper())) + return false; + if (Name.equals_lower("not")) + SM.onNot(); + else if (Name.equals_lower("or")) + SM.onOr(); + else if (Name.equals_lower("shl")) + SM.onLShift(); + else if (Name.equals_lower("shr")) + SM.onRShift(); + else if (Name.equals_lower("xor")) + SM.onXor(); + else if (Name.equals_lower("and")) + SM.onAnd(); + else + return false; + return true; +} + bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); @@ -1324,31 +1361,36 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { const MCExpr *Val; SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); + UpdateLocLex = false; if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { SM.onRegister(TmpReg); - UpdateLocLex = false; - break; + } else if (ParseIntelNamedOperator(Identifier, SM)) { + UpdateLocLex = true; + } else if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) + return Error(Tok.getLoc(), "Unexpected identifier!"); + SM.onIdentifierExpr(Val, Identifier); + } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) { + if (OpKind == IOK_OFFSET) + return Error(IdentLoc, "Dealing OFFSET operator as part of" + "a compound immediate expression is yet to be supported"); + int64_t Val = ParseIntelOperator(OpKind); + if (!Val) + return true; + StringRef ErrMsg; + if (SM.onInteger(Val, ErrMsg)) + return Error(IdentLoc, ErrMsg); + } else if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { + return false; } else { - if (!isParsingInlineAsm()) { - if (getParser().parsePrimaryExpr(Val, End)) - return Error(Tok.getLoc(), "Unexpected identifier!"); - } else { - // This is a dot operator, not an adjacent identifier. 
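Editorial note: ParseIntelNamedOperator, added just above, accepts the MASM-style spellings of the bitwise operators (NOT, AND, OR, XOR, SHL, SHR) only when the name is uniformly lower-case or upper-case. A self-contained sketch of that recognition rule, in plain C++ with names chosen here for illustration:

```cpp
// Sketch of the case rule in ParseIntelNamedOperator: a named operator
// must be all-lowercase or all-uppercase, never mixed case.
// Plain C++ illustration, not the LLVM parser.
#include <algorithm>
#include <cctype>
#include <iostream>
#include <string>

static std::string toLower(std::string S) {
  std::transform(S.begin(), S.end(), S.begin(),
                 [](unsigned char C) { return std::tolower(C); });
  return S;
}
static std::string toUpper(std::string S) {
  std::transform(S.begin(), S.end(), S.begin(),
                 [](unsigned char C) { return std::toupper(C); });
  return S;
}

// Returns true if Name spells a recognized named bitwise operator.
static bool isNamedOperator(const std::string &Name) {
  if (Name != toLower(Name) && Name != toUpper(Name))
    return false; // "Xor" is rejected; "xor" and "XOR" are accepted.
  std::string L = toLower(Name);
  return L == "not" || L == "and" || L == "or" || L == "xor" ||
         L == "shl" || L == "shr";
}

int main() {
  std::cout << isNamedOperator("SHL") << isNamedOperator("shl")
            << isNamedOperator("Shl") << "\n"; // prints 110
}
```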
- if (Identifier.find('.') != StringRef::npos && - PrevTK == AsmToken::RBrac) { - return false; - } else { - InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) - return true; - } - } + InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); + if (ParseIntelIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) + return true; SM.onIdentifierExpr(Val, Identifier); - UpdateLocLex = false; - break; } - return Error(Tok.getLoc(), "Unexpected identifier!"); + break; } case AsmToken::Integer: { StringRef ErrMsg; @@ -1715,11 +1757,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { OffsetOfLoc, Identifier, Info.OpDecl); } -enum IntelOperatorKind { - IOK_LENGTH, - IOK_SIZE, - IOK_TYPE -}; +// Query a candidate string for being an Intel assembly operator +// Report back its kind, or IOK_INVALID if does not evaluated as a known one +unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) { + return StringSwitch<unsigned>(Name) + .Cases("TYPE","type",IOK_TYPE) + .Cases("SIZE","size",IOK_SIZE) + .Cases("LENGTH","length",IOK_LENGTH) + .Cases("OFFSET","offset",IOK_OFFSET) + .Default(IOK_INVALID); +} /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator /// returns the number of elements in an array. It returns the value 1 for @@ -1727,7 +1774,7 @@ enum IntelOperatorKind { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. -std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { +unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); @@ -1739,11 +1786,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { StringRef Identifier = Tok.getString(); if (ParseIntelIdentifier(Val, Identifier, Info, /*Unevaluated=*/true, End)) - return nullptr; - - if (!Info.OpDecl) - return ErrorOperand(Start, "unable to lookup expression"); + return 0; + if (!Info.OpDecl) { + Error(Start, "unable to lookup expression"); + return 0; + } + unsigned CVal = 0; switch(OpKind) { default: llvm_unreachable("Unexpected operand kind!"); @@ -1757,8 +1806,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { unsigned Len = End.getPointer() - TypeLoc.getPointer(); InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); - const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); - return X86Operand::CreateImm(Imm, Start, End); + return CVal; } std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { @@ -1766,18 +1814,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; - // Offset, length, type and size operators. 
- if (isParsingInlineAsm()) { - StringRef AsmTokStr = Tok.getString(); - if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") + // FIXME: Offset operator + // Should be handled as part of immediate expression, as other operators + // Currently, only supported as a stand-alone operand + if (isParsingInlineAsm()) + if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET) return ParseIntelOffsetOfOperator(); - if (AsmTokStr == "length" || AsmTokStr == "LENGTH") - return ParseIntelOperator(IOK_LENGTH); - if (AsmTokStr == "size" || AsmTokStr == "SIZE") - return ParseIntelOperator(IOK_SIZE); - if (AsmTokStr == "type" || AsmTokStr == "TYPE") - return ParseIntelOperator(IOK_TYPE); - } bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); @@ -2360,7 +2402,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name == "lock" || Name == "rep" || Name == "repe" || Name == "repz" || Name == "repne" || Name == "repnz" || - Name == "rex64" || Name == "data16"; + Name == "rex64" || Name == "data16" || Name == "data32"; bool CurlyAsEndOfStatement = false; // This does the actual operand parsing. Don't parse any more if we have a diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index 9db1a8483bee0..9f1fa6c659070 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -1,4 +1,4 @@ -//===-- X86Operand.h - Parsed X86 machine instruction --------------------===// +//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,17 @@ #define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #include "X86AsmParserCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/ADT/STLExtras.h" -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/SMLoc.h" +#include <cassert> +#include <memory> namespace llvm { @@ -74,11 +79,14 @@ struct X86Operand : public MCParsedAsmOperand { /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const override { return EndLoc; } + /// getLocRange - Get the range between the first and last token of this /// operand. SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. 
SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; } @@ -271,6 +279,9 @@ struct X86Operand : public MCParsedAsmOperand { bool isMem256_RC256X() const { return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31); } + bool isMem256_RC512() const { + return isMem256() && isMemIndexReg(X86::ZMM0, X86::ZMM31); + } bool isMem512_RC256X() const { return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31); } @@ -419,10 +430,12 @@ struct X86Operand : public MCParsedAsmOperand { RegNo = getGR32FromGR64(RegNo); Inst.addOperand(MCOperand::createReg(RegNo)); } + void addAVX512RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); } + void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); addExpr(Inst, getImm()); @@ -451,6 +464,7 @@ struct X86Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(getMemBaseReg())); Inst.addOperand(MCOperand::createReg(getMemSegReg())); } + void addDstIdxOperands(MCInst &Inst, unsigned N) const { assert((N == 1) && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBaseReg())); @@ -541,6 +555,6 @@ struct X86Operand : public MCParsedAsmOperand { } }; -} // End of namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 9dfd09022bdcd..fc4adddc149ba 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -10,11 +10,20 @@ tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel) tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) +if(LLVM_BUILD_GLOBAL_ISEL) + tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) + tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) +endif() + add_public_tablegen_target(X86CommonTableGen) # Add GlobalISel files if the build option was enabled. 
set(GLOBAL_ISEL_FILES X86CallLowering.cpp + X86LegalizerInfo.cpp + X86RegisterBankInfo.cpp + X86InstructionSelector.cpp ) if(LLVM_BUILD_GLOBAL_ISEL) @@ -43,6 +52,7 @@ set(sources X86EvexToVex.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp + X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp X86RegisterInfo.cpp diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 0871888bbfcd6..36ad23bb41c05 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -368,32 +368,49 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, bool isBranch = false; uint64_t pcrel = 0; - if (type == TYPE_RELv) { + if (type == TYPE_REL) { isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - switch (insn.displacementSize) { + switch (operand.encoding) { default: break; - case 1: + case ENCODING_Iv: + switch (insn.displacementSize) { + default: + break; + case 1: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case 2: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case 4: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case 8: + break; + } + break; + case ENCODING_IB: if(immediate & 0x80) immediate |= ~(0xffull); break; - case 2: + case ENCODING_IW: if(immediate & 0x8000) immediate |= ~(0xffffull); break; - case 4: + case ENCODING_ID: if(immediate & 0x80000000) immediate |= ~(0xffffffffull); break; - case 8: - break; } } // By default sign-extend all X86 immediates based on their encoding. - else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || - type == TYPE_IMM64 || type == TYPE_IMMv) { + else if (type == TYPE_IMM) { switch (operand.encoding) { default: break; @@ -620,38 +637,17 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } switch (type) { - case TYPE_XMM32: - case TYPE_XMM64: - case TYPE_XMM128: + case TYPE_XMM: mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4))); return; - case TYPE_XMM256: + case TYPE_YMM: mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4))); return; - case TYPE_XMM512: + case TYPE_ZMM: mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4))); return; case TYPE_BNDR: mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4))); - case TYPE_REL8: - isBranch = true; - pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - if (immediate & 0x80) - immediate |= ~(0xffull); - break; - case TYPE_REL16: - isBranch = true; - pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - if (immediate & 0x8000) - immediate |= ~(0xffffull); - break; - case TYPE_REL32: - case TYPE_REL64: - isBranch = true; - pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - if(immediate & 0x80000000) - immediate |= ~(0xffffffffull); - break; default: // operand is 64 bits wide. Do nothing. 
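Editorial note: the translateImmediate change above sign-extends a decoded immediate according to its encoded width (1, 2, 4, or 8 bytes) instead of keying on a per-width operand type. The masking pattern it repeats, factored into one helper as a sketch (plain C++, not the disassembler's code):

```cpp
// Sketch of the sign-extension pattern used in translateImmediate:
// if the top bit of the encoded width is set, fill all higher bits.
// Widths are in bytes; plain C++ for illustration.
#include <cassert>
#include <cstdint>

static uint64_t signExtend(uint64_t Imm, unsigned WidthBytes) {
  switch (WidthBytes) {
  case 1: if (Imm & 0x80)       Imm |= ~0xffull;       break;
  case 2: if (Imm & 0x8000)     Imm |= ~0xffffull;     break;
  case 4: if (Imm & 0x80000000) Imm |= ~0xffffffffull; break;
  case 8: break; // already full width
  }
  return Imm;
}

int main() {
  assert(signExtend(0xfe, 1) == 0xfffffffffffffffeull); // -2 as int8
  assert(signExtend(0x7f, 1) == 0x7full);               // +127 stays positive
}
```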
break; @@ -662,8 +658,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, mcInst, Dis)) mcInst.addOperand(MCOperand::createImm(immediate)); - if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 || - type == TYPE_MOFFS32 || type == TYPE_MOFFS64) { + if (type == TYPE_MOFFS) { MCOperand segmentReg; segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]); mcInst.addOperand(segmentReg); @@ -767,7 +762,27 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, Opcode == X86::VPGATHERDQYrm || Opcode == X86::VPGATHERQQrm || Opcode == X86::VPGATHERDDrm || - Opcode == X86::VPGATHERQDrm); + Opcode == X86::VPGATHERQDrm || + Opcode == X86::VGATHERDPDZ128rm || + Opcode == X86::VGATHERDPDZ256rm || + Opcode == X86::VGATHERDPSZ128rm || + Opcode == X86::VGATHERQPDZ128rm || + Opcode == X86::VGATHERQPSZ128rm || + Opcode == X86::VPGATHERDDZ128rm || + Opcode == X86::VPGATHERDQZ128rm || + Opcode == X86::VPGATHERDQZ256rm || + Opcode == X86::VPGATHERQDZ128rm || + Opcode == X86::VPGATHERQQZ128rm || + Opcode == X86::VSCATTERDPDZ128mr || + Opcode == X86::VSCATTERDPDZ256mr || + Opcode == X86::VSCATTERDPSZ128mr || + Opcode == X86::VSCATTERQPDZ128mr || + Opcode == X86::VSCATTERQPSZ128mr || + Opcode == X86::VPSCATTERDDZ128mr || + Opcode == X86::VPSCATTERDQZ128mr || + Opcode == X86::VPSCATTERDQZ256mr || + Opcode == X86::VPSCATTERQDZ128mr || + Opcode == X86::VPSCATTERQQZ128mr); bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || Opcode == X86::VGATHERDPSYrm || Opcode == X86::VGATHERQPSYrm || @@ -775,13 +790,49 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, Opcode == X86::VPGATHERDQZrm || Opcode == X86::VPGATHERQQYrm || Opcode == X86::VPGATHERDDYrm || - Opcode == X86::VPGATHERQDYrm); + Opcode == X86::VPGATHERQDYrm || + Opcode == X86::VGATHERDPSZ256rm || + Opcode == X86::VGATHERQPDZ256rm || + Opcode == X86::VGATHERQPSZ256rm || + Opcode == X86::VPGATHERDDZ256rm || + Opcode == X86::VPGATHERQQZ256rm || + Opcode == X86::VPGATHERQDZ256rm || + Opcode == X86::VSCATTERDPDZmr || + Opcode == X86::VPSCATTERDQZmr || + Opcode == X86::VSCATTERDPSZ256mr || + Opcode == X86::VSCATTERQPDZ256mr || + Opcode == X86::VSCATTERQPSZ256mr || + Opcode == X86::VPSCATTERDDZ256mr || + Opcode == X86::VPSCATTERQQZ256mr || + Opcode == X86::VPSCATTERQDZ256mr || + Opcode == X86::VGATHERPF0DPDm || + Opcode == X86::VGATHERPF1DPDm || + Opcode == X86::VSCATTERPF0DPDm || + Opcode == X86::VSCATTERPF1DPDm); bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm || Opcode == X86::VGATHERDPSZrm || Opcode == X86::VGATHERQPSZrm || Opcode == X86::VPGATHERQQZrm || Opcode == X86::VPGATHERDDZrm || - Opcode == X86::VPGATHERQDZrm); + Opcode == X86::VPGATHERQDZrm || + Opcode == X86::VSCATTERQPDZmr || + Opcode == X86::VSCATTERDPSZmr || + Opcode == X86::VSCATTERQPSZmr || + Opcode == X86::VPSCATTERQQZmr || + Opcode == X86::VPSCATTERDDZmr || + Opcode == X86::VPSCATTERQDZmr || + Opcode == X86::VGATHERPF0DPSm || + Opcode == X86::VGATHERPF0QPDm || + Opcode == X86::VGATHERPF0QPSm || + Opcode == X86::VGATHERPF1DPSm || + Opcode == X86::VGATHERPF1QPDm || + Opcode == X86::VGATHERPF1QPSm || + Opcode == X86::VSCATTERPF0DPSm || + Opcode == X86::VSCATTERPF0QPDm || + Opcode == X86::VSCATTERPF0QPSm || + Opcode == X86::VSCATTERPF1DPSm || + Opcode == X86::VSCATTERPF1QPDm || + Opcode == X86::VSCATTERPF1QPSm); if (IndexIs128 || IndexIs256 || IndexIs512) { unsigned IndexOffset = insn.sibIndex - (insn.addressSize == 8 ? 
SIB_INDEX_RAX:SIB_INDEX_EAX); @@ -909,38 +960,15 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_R64: case TYPE_Rv: case TYPE_MM64: - case TYPE_XMM32: - case TYPE_XMM64: - case TYPE_XMM128: - case TYPE_XMM256: - case TYPE_XMM512: - case TYPE_VK1: - case TYPE_VK2: - case TYPE_VK4: - case TYPE_VK8: - case TYPE_VK16: - case TYPE_VK32: - case TYPE_VK64: + case TYPE_XMM: + case TYPE_YMM: + case TYPE_ZMM: + case TYPE_VK: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: return translateRMRegister(mcInst, insn); case TYPE_M: - case TYPE_M8: - case TYPE_M16: - case TYPE_M32: - case TYPE_M64: - case TYPE_M128: - case TYPE_M256: - case TYPE_M512: - case TYPE_Mv: - case TYPE_M32FP: - case TYPE_M64FP: - case TYPE_M80FP: - case TYPE_M1616: - case TYPE_M1632: - case TYPE_M1664: - case TYPE_LEA: return translateRMMemory(mcInst, insn, Dis); } } @@ -992,6 +1020,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, case ENCODING_WRITEMASK: return translateMaskRegister(mcInst, insn.writemask); CASE_ENCODING_RM: + CASE_ENCODING_VSIB: return translateRM(mcInst, operand, insn, Dis); case ENCODING_IB: case ENCODING_IW: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index ab64d6fcf70bc..b7f637e9a8cd7 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -650,11 +650,6 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->addressSize = (hasAdSize ? 4 : 8); insn->displacementSize = 4; insn->immediateSize = 4; - } else if (insn->rexPrefix) { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 4 : 8); - insn->displacementSize = (hasOpSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); } else { insn->registerSize = (hasOpSize ? 2 : 4); insn->addressSize = (hasAdSize ? 4 : 8); @@ -1475,21 +1470,13 @@ static int readModRM(struct InternalInstruction* insn) { return prefix##_EAX + index; \ case TYPE_R64: \ return prefix##_RAX + index; \ - case TYPE_XMM512: \ + case TYPE_ZMM: \ return prefix##_ZMM0 + index; \ - case TYPE_XMM256: \ + case TYPE_YMM: \ return prefix##_YMM0 + index; \ - case TYPE_XMM128: \ - case TYPE_XMM64: \ - case TYPE_XMM32: \ + case TYPE_XMM: \ return prefix##_XMM0 + index; \ - case TYPE_VK1: \ - case TYPE_VK2: \ - case TYPE_VK4: \ - case TYPE_VK8: \ - case TYPE_VK16: \ - case TYPE_VK32: \ - case TYPE_VK64: \ + case TYPE_VK: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ @@ -1562,6 +1549,7 @@ static int fixupReg(struct InternalInstruction *insn, return -1; break; CASE_ENCODING_RM: + CASE_ENCODING_VSIB: if (insn->eaBase >= insn->eaRegBase) { insn->eaBase = (EABase)fixupRMValue(insn, (OperandType)op->type, @@ -1753,6 +1741,18 @@ static int readOperands(struct InternalInstruction* insn) { case ENCODING_SI: case ENCODING_DI: break; + CASE_ENCODING_VSIB: + // VSIB can use the V2 bit so check only the other bits. + if (needVVVV) + needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0); + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + // Apply the AVX512 compressed displacement scaling factor. 
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) + insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB); + break; case ENCODING_REG: CASE_ENCODING_RM: if (readModRM(insn)) @@ -1774,8 +1774,7 @@ static int readOperands(struct InternalInstruction* insn) { } if (readImmediate(insn, 1)) return -1; - if (Op.type == TYPE_XMM128 || - Op.type == TYPE_XMM256) + if (Op.type == TYPE_XMM || Op.type == TYPE_YMM) sawRegImm = 1; break; case ENCODING_IW: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 0a835b876d905..e0f4399b3687e 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -339,6 +339,15 @@ enum ModRMDecisionType { case ENCODING_RM_CD32: \ case ENCODING_RM_CD64 +#define CASE_ENCODING_VSIB \ + case ENCODING_VSIB: \ + case ENCODING_VSIB_CD2: \ + case ENCODING_VSIB_CD4: \ + case ENCODING_VSIB_CD8: \ + case ENCODING_VSIB_CD16: \ + case ENCODING_VSIB_CD32: \ + case ENCODING_VSIB_CD64 + // Physical encodings of instruction operands. #define ENCODINGS \ ENUM_ENTRY(ENCODING_NONE, "") \ @@ -350,6 +359,13 @@ enum ModRMDecisionType { ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \ ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \ ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \ + ENUM_ENTRY(ENCODING_VSIB, "VSIB operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_VSIB_CD2, "VSIB operand with CDisp scaling of 2") \ + ENUM_ENTRY(ENCODING_VSIB_CD4, "VSIB operand with CDisp scaling of 4") \ + ENUM_ENTRY(ENCODING_VSIB_CD8, "VSIB operand with CDisp scaling of 8") \ + ENUM_ENTRY(ENCODING_VSIB_CD16,"VSIB operand with CDisp scaling of 16") \ + ENUM_ENTRY(ENCODING_VSIB_CD32,"VSIB operand with CDisp scaling of 32") \ + ENUM_ENTRY(ENCODING_VSIB_CD64,"VSIB operand with CDisp scaling of 64") \ ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \ ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \ @@ -383,85 +399,38 @@ enum OperandEncoding { // Semantic interpretations of instruction operands. 
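Editorial note: the VSIB handling above applies the AVX-512 compressed-displacement (disp8*N) scaling with `insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB)`, which works because the ENCODING_VSIB_CD* enumerators are declared consecutively, so the difference is log2 of the scale. A tiny sketch of that arithmetic with illustrative enumerator values (not the real decoder tables):

```cpp
// Sketch of AVX-512 disp8*N scaling: an 8-bit displacement is stored
// divided by the tuple size N and must be multiplied back when decoding.
// Enumerator layout mirrors the CASE_ENCODING_VSIB block above; illustrative.
#include <cassert>
#include <cstdint>

enum Encoding {
  ENCODING_VSIB,      // scale 1
  ENCODING_VSIB_CD2,  // scale 2
  ENCODING_VSIB_CD4,  // scale 4
  ENCODING_VSIB_CD8,  // scale 8
  ENCODING_VSIB_CD16,
  ENCODING_VSIB_CD32,
  ENCODING_VSIB_CD64
};

static int64_t scaleDisp8(int64_t EncodedDisp, Encoding E) {
  // Consecutive enumerators make the difference the log2 of N.
  return EncodedDisp * (int64_t(1) << (E - ENCODING_VSIB));
}

int main() {
  assert(scaleDisp8(3, ENCODING_VSIB_CD8) == 24);   // disp8 * 8
  assert(scaleDisp8(-1, ENCODING_VSIB_CD32) == -32);
}
```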
#define TYPES \ ENUM_ENTRY(TYPE_NONE, "") \ - ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ - ENUM_ENTRY(TYPE_REL16, "2-byte") \ - ENUM_ENTRY(TYPE_REL32, "4-byte") \ - ENUM_ENTRY(TYPE_REL64, "8-byte") \ - ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \ - ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \ - ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_REL, "immediate address") \ ENUM_ENTRY(TYPE_R8, "1-byte register operand") \ ENUM_ENTRY(TYPE_R16, "2-byte") \ ENUM_ENTRY(TYPE_R32, "4-byte") \ ENUM_ENTRY(TYPE_R64, "8-byte") \ - ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \ - ENUM_ENTRY(TYPE_IMM16, "2-byte") \ - ENUM_ENTRY(TYPE_IMM32, "4-byte") \ - ENUM_ENTRY(TYPE_IMM64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM, "immediate operand") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ - ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ - ENUM_ENTRY(TYPE_RM16, "2-byte") \ - ENUM_ENTRY(TYPE_RM32, "4-byte") \ - ENUM_ENTRY(TYPE_RM64, "8-byte") \ ENUM_ENTRY(TYPE_M, "Memory operand") \ - ENUM_ENTRY(TYPE_M8, "1-byte") \ - ENUM_ENTRY(TYPE_M16, "2-byte") \ - ENUM_ENTRY(TYPE_M32, "4-byte") \ - ENUM_ENTRY(TYPE_M64, "8-byte") \ - ENUM_ENTRY(TYPE_LEA, "Effective address") \ - ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \ - ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \ - ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ - ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ - ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ - ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \ - ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \ - ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \ - ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \ - ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \ - ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \ - ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \ - ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \ - ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \ - "base)") \ - ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \ - ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \ - ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \ - ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ - ENUM_ENTRY(TYPE_M64FP, "64-bit") \ - ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ + ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \ + ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \ + ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \ - ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ - ENUM_ENTRY(TYPE_XMM64, "8-byte") \ - ENUM_ENTRY(TYPE_XMM128, "16-byte") \ - ENUM_ENTRY(TYPE_XMM256, "32-byte") \ - ENUM_ENTRY(TYPE_XMM512, "64-byte") \ - ENUM_ENTRY(TYPE_VK1, "1-bit") \ - ENUM_ENTRY(TYPE_VK2, "2-bit") \ - ENUM_ENTRY(TYPE_VK4, "4-bit") \ - ENUM_ENTRY(TYPE_VK8, "8-bit") \ - ENUM_ENTRY(TYPE_VK16, "16-bit") \ - ENUM_ENTRY(TYPE_VK32, "32-bit") \ - ENUM_ENTRY(TYPE_VK64, "64-bit") \ + ENUM_ENTRY(TYPE_XMM, "16-byte") \ + ENUM_ENTRY(TYPE_YMM, "32-byte") \ + ENUM_ENTRY(TYPE_ZMM, "64-byte") \ + ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register 
operand") \ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \ \ - ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \ ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \ - ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \ ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \ ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \ ENUM_ENTRY(TYPE_DUP1, "operand 1") \ ENUM_ENTRY(TYPE_DUP2, "operand 2") \ ENUM_ENTRY(TYPE_DUP3, "operand 3") \ ENUM_ENTRY(TYPE_DUP4, "operand 4") \ - ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") #define ENUM_ENTRY(n, d) n, enum OperandType { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 10b7e6ff5ee20..6aa7003067440 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -12,19 +12,22 @@ // //===----------------------------------------------------------------------===// -#include "X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86ATTInstPrinter.h" #include "X86InstComments.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cinttypes> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "asm-printer" @@ -61,6 +64,17 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, OS << "\tcallq\t"; printPCRelImm(MI, 0, OS); } + // data16 and data32 both have the same encoding of 0x66. While data32 is + // valid only in 16 bit systems, data16 is valid in the rest. + // There seems to be some lack of support of the Requires clause that causes + // 0x66 to be interpreted as "data16" by the asm printer. + // Thus we add an adjustment here in order to print the "right" instruction. + else if (MI->getOpcode() == X86::DATA16_PREFIX && + (STI.getFeatureBits()[X86::Mode16Bit])) { + MCInst Data32MI(*MI); + Data32MI.setOpcode(X86::DATA32_PREFIX); + printInstruction(&Data32MI, OS); + } // Try to print any aliases first. else if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); @@ -135,6 +149,7 @@ void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, case 3: O << "{rz-sae}"; break; } } + /// printPCRelImm - This is used to print an immediate value that ends up /// being encoded as a pc-relative value (e.g. for jumps and calls). These /// print slightly differently than normal immediates. 
For example, a $ is not diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index bbb3090766107..946c1c73f088a 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -1,4 +1,4 @@ -//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=// +//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -137,6 +137,7 @@ public: private: bool HasCustomInstComment; }; -} -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 8594addb5dd41..6e062ec59347b 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -1189,8 +1189,6 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, OS << ']'; --i; // For loop increments element #. } - //MI->print(OS, 0); - OS << "\n"; // We successfully added a comment to this instruction. return true; diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 4443edb8e342b..a8c631ae282f9 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -12,16 +12,18 @@ // //===----------------------------------------------------------------------===// -#include "X86IntelInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86InstComments.h" +#include "X86IntelInstPrinter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -#include <cctype> +#include <cassert> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "asm-printer" diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 20cd7ffb2e638..ace31186a0544 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -157,6 +157,6 @@ public: } }; -} +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index e83ec9f4045ad..a713af6aadb5a 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -109,7 +109,7 @@ public: } void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override { + uint64_t Value, bool IsPCRel, MCContext &Ctx) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); assert(Fixup.getOffset() + Size <= DataSize && diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index aab552547fac4..d8953da4abb2d 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -212,7 +212,12 @@ namespace X86II { /// the offset from beginning of section. /// /// This is the TLS offset for the COFF/Windows TLS mechanism. 
- MO_SECREL + MO_SECREL, + + /// MO_ABS8 - On a symbol operand this indicates that the symbol is known + /// to be an absolute symbol in range [0,128), so we can use the @ABS8 + /// symbol modifier. + MO_ABS8, }; enum : uint64_t { diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index da69da51df108..0b73df3a2ff8c 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -13,24 +13,28 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include <cassert> +#include <cstdint> using namespace llvm; namespace { - class X86ELFObjectWriter : public MCELFObjectTargetWriter { - public: - X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); - ~X86ELFObjectWriter() override; +class X86ELFObjectWriter : public MCELFObjectTargetWriter { +public: + X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine); + ~X86ELFObjectWriter() override = default; - protected: - unsigned getRelocType(MCContext &Ctx, const MCValue &Target, - const MCFixup &Fixup, bool IsPCRel) const override; - }; -} +protected: + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; +}; + +} // end anonymous namespace X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) @@ -40,9 +44,6 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, (EMachine != ELF::EM_386) && (EMachine != ELF::EM_IAMCU)) {} -X86ELFObjectWriter::~X86ELFObjectWriter() -{} - enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; static X86_64RelType getType64(unsigned Kind, @@ -96,6 +97,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, default: llvm_unreachable("Unimplemented"); case MCSymbolRefExpr::VK_None: + case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { case RT64_64: return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; @@ -219,6 +221,7 @@ static unsigned getRelocType32(MCContext &Ctx, default: llvm_unreachable("Unimplemented"); case MCSymbolRefExpr::VK_None: + case MCSymbolRefExpr::VK_X86_ABS8: switch (Type) { case RT32_32: return IsPCRel ? 
ELF::R_386_PC32 : ELF::R_386_32; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 8045e7c6d8729..10e2bbc64d3cf 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -11,35 +11,43 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <cstdlib> using namespace llvm; #define DEBUG_TYPE "mccodeemitter" namespace { + class X86MCCodeEmitter : public MCCodeEmitter { - X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; - void operator=(const X86MCCodeEmitter &) = delete; const MCInstrInfo &MCII; MCContext &Ctx; + public: X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) : MCII(mcii), Ctx(ctx) { } - - ~X86MCCodeEmitter() override {} + X86MCCodeEmitter(const X86MCCodeEmitter &) = delete; + X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete; + ~X86MCCodeEmitter() override = default; bool is64BitMode(const MCSubtargetInfo &STI) const { return STI.getFeatureBits()[X86::Mode64Bit]; @@ -106,8 +114,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const; - inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, - unsigned RM) { + static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); return RM | (RegOpcode << 3) | (Mod << 6); } @@ -149,12 +156,6 @@ public: } // end anonymous namespace -MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new X86MCCodeEmitter(MCII, Ctx); -} - /// isDisp8 - Return true if this signed displacement fits in a 8-bit /// sign-extended field. static bool isDisp8(int Value) { @@ -1436,7 +1437,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: { + case X86II::MRM6r: case X86II::MRM7r: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). ++CurOp; if (HasEVEX_K) // Skip writemask @@ -1446,13 +1447,12 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r, CurByte, OS); break; - } case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { + case X86II::MRM6m: case X86II::MRM7m: if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). 
++CurOp; if (HasEVEX_K) // Skip writemask @@ -1463,7 +1463,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, Rex, CurByte, OS, Fixups, STI); CurOp += X86::AddrNumOperands; break; - } + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: @@ -1527,3 +1527,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, } #endif } + +MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new X86MCCodeEmitter(MCII, Ctx); +} diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 33376b6d1b906..d6777fc8aa6ae 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -10,6 +10,7 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/COFF.h" @@ -17,28 +18,24 @@ using namespace llvm; -namespace llvm { - class MCObjectWriter; -} - namespace { - class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { - public: - X86WinCOFFObjectWriter(bool Is64Bit); - ~X86WinCOFFObjectWriter() override; - unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend &MAB) const override; - }; -} +class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { +public: + X86WinCOFFObjectWriter(bool Is64Bit); + ~X86WinCOFFObjectWriter() override = default; + + unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, + const MCAsmBackend &MAB) const override; +}; + +} // end anonymous namespace X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 : COFF::IMAGE_FILE_MACHINE_I386) {} -X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {} - unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsCrossSection, diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 2cb80a482d06f..fdcc7e1ab7b05 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -21,7 +21,10 @@ namespace llvm { class FunctionPass; class ImmutablePass; +class InstructionSelector; class PassRegistry; +class X86RegisterBankInfo; +class X86Subtarget; class X86TargetMachine; /// This pass converts a legalized DAG into a X86-specific DAG, ready for @@ -92,6 +95,9 @@ void initializeFixupBWInstPassPass(PassRegistry &); /// encoding when possible in order to reduce code size. 
FunctionPass *createX86EvexToVexInsts(); +InstructionSelector *createX86InstructionSelector(X86Subtarget &, + X86RegisterBankInfo &); + void initializeEvexToVexInstPassPass(PassRegistry &); } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 83a23d4ad680e..8fcc8e31d5d44 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -187,8 +187,6 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", "Support RTM instructions">; -def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true", - "Support HLE">; def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", "Support ADX instructions">; def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", @@ -202,6 +200,8 @@ def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", "Support LAHF and SAHF instructions">; def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", "Enable MONITORX/MWAITX timer functionality">; +def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", + "Enable Cache Line Zero">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", @@ -215,18 +215,10 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; -def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true", - "Invalidate Process-Context Identifier">; -def FeatureVMFUNC : SubtargetFeature<"vmfunc", "HasVMFUNC", "true", - "VM Functions">; -def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true", - "Supervisor Mode Access Protection">; def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", "Enable Software Guard Extensions">; def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true", "Flush A Cache Line Optimized">; -def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true", - "Enable Persistent Commit">; def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true", "Cache Line Write Back">; // TODO: This feature ought to be renamed. @@ -246,11 +238,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; -// On at least some AMD processors, there is no performance hazard to writing -// only the lower parts of a YMM register without clearing the upper part. -def FeatureFastPartialYMMWrite - : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite", - "true", "Partial writes to YMM registers are fast">; +// On some X86 processors, there is no performance hazard to writing only the +// lower parts of a YMM or ZMM register without clearing the upper part. +def FeatureFastPartialYMMorZMMWrite + : SubtargetFeature<"fast-partial-ymm-or-zmm-write", + "HasFastPartialYMMorZMMWrite", + "true", "Partial writes to YMM/ZMM registers are fast">; // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if // vector FSQRT has higher throughput than the corresponding NR code. 
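Editorial note: the FSQRT feature comments above weigh the hardware square-root instruction against a Newton-Raphson ("NR") expansion built from a reciprocal-square-root estimate. As a rough sketch of what such NR code computes, here is one refinement step in plain C++ with a crude software estimate standing in for RSQRTSS; this is illustrative, not the backend's actual lowering.

```cpp
// Rough sketch of the Newton-Raphson reciprocal-sqrt refinement that the
// "NR code" mentioned above is built from: estimate 1/sqrt(a), refine once,
// then multiply by a to recover sqrt(a). Plain C++ stand-in for RSQRTSS.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

static float rsqrtEstimate(float A) {
  uint32_t I;
  std::memcpy(&I, &A, sizeof(I));
  I = 0x5f3759dfu - (I >> 1);   // crude initial guess (illustrative)
  float X;
  std::memcpy(&X, &I, sizeof(X));
  return X;
}

static float nrSqrt(float A) {
  float X = rsqrtEstimate(A);
  X = X * (1.5f - 0.5f * A * X * X); // one Newton-Raphson refinement step
  return A * X;                      // sqrt(a) = a * (1/sqrt(a))
}

int main() {
  std::cout << nrSqrt(2.0f) << " vs " << std::sqrt(2.0f) << "\n";
}
```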
@@ -271,6 +264,15 @@ def FeatureFastLZCNT "fast-lzcnt", "HasFastLZCNT", "true", "LZCNT instructions are as fast as most simple integer ops">; + +// Sandy Bridge and newer processors can use SHLD with the same source on both +// inputs to implement rotate to avoid the partial flag update of the normal +// rotate instructions. +def FeatureFastSHLDRotate + : SubtargetFeature< + "fast-shld-rotate", "HasFastSHLDRotate", "true", + "SHLD can be used as a faster rotate">; + //===----------------------------------------------------------------------===// // X86 processors supported. //===----------------------------------------------------------------------===// @@ -466,7 +468,8 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureXSAVE, FeatureXSAVEOPT, FeatureLAHFSAHF, - FeatureFastScalarFSQRT + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, @@ -498,10 +501,6 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureFMA, FeatureLZCNT, FeatureMOVBE, - FeatureINVPCID, - FeatureVMFUNC, - FeatureRTM, - FeatureHLE, FeatureSlowIncDec ]>; @@ -512,8 +511,7 @@ def : HaswellProc<"core-avx2">; // Legacy alias. def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [ FeatureADX, - FeatureRDSEED, - FeatureSMAP + FeatureRDSEED ]>; class BroadwellProc<string Name> : ProcModel<Name, HaswellModel, BDWFeatures.Value, []>; @@ -521,6 +519,7 @@ def : BroadwellProc<"broadwell">; def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ FeatureMPX, + FeatureRTM, FeatureXSAVEC, FeatureXSAVES, FeatureSGX, @@ -547,7 +546,8 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, FeatureLZCNT, FeatureBMI, FeatureBMI2, - FeatureFMA + FeatureFMA, + FeatureFastPartialYMMorZMMWrite ]>; def : KnightsLandingProc<"knl">; @@ -558,7 +558,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ FeatureBWI, FeatureVLX, FeaturePKU, - FeaturePCOMMIT, FeatureCLWB ]>; @@ -662,7 +661,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeatureXSAVEOPT, FeatureSlowSHLD, FeatureLAHFSAHF, - FeatureFastPartialYMMWrite + FeatureFastPartialYMMorZMMWrite ]>; // Bulldozer @@ -771,6 +770,7 @@ def: ProcessorModel<"znver1", BtVer2Model, [ FeatureBMI, FeatureBMI2, FeatureCLFLUSHOPT, + FeatureCLZERO, FeatureCMPXCHG16B, FeatureF16C, FeatureFMA, @@ -788,7 +788,6 @@ def: ProcessorModel<"znver1", BtVer2Model, [ FeatureRDRAND, FeatureRDSEED, FeatureSHA, - FeatureSMAP, FeatureSSE4A, FeatureSlowSHLD, FeatureX87, @@ -824,6 +823,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, //===----------------------------------------------------------------------===// include "X86RegisterInfo.td" +include "X86RegisterBanks.td" //===----------------------------------------------------------------------===// // Instruction Descriptions diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 6798253d0f6aa..44bc373b0394c 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -81,7 +81,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); - void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr 
&MI); @@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL); + void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL); + // Helper function that emits the XRay sleds we've collected for a particular // function. void EmitXRayTable(); diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 78bd2add8c3be..765af67de160a 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -17,22 +17,35 @@ // //===----------------------------------------------------------------------===// -#include <algorithm> - -#include "X86.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "X86FrameLowering.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> using namespace llvm; @@ -44,6 +57,7 @@ static cl::opt<bool> cl::init(false), cl::Hidden); namespace { + class X86CallFrameOptimization : public MachineFunctionPass { public: X86CallFrameOptimization() : MachineFunctionPass(ID) {} @@ -53,30 +67,28 @@ public: private: // Information we know about a particular call site struct CallContext { - CallContext() - : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false) {} + CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {} // Iterator referring to the frame setup instruction MachineBasicBlock::iterator FrameSetup; // Actual call instruction - MachineInstr *Call; + MachineInstr *Call = nullptr; // A copy of the stack pointer - MachineInstr *SPCopy; + MachineInstr *SPCopy = nullptr; // The total displacement of all passed parameters - int64_t ExpectedDist; + int64_t ExpectedDist = 0; // The sequence of movs used to pass the parameters SmallVector<MachineInstr *, 4> MovVector; // True if this call site has no stack parameters - bool NoStackParams; + bool NoStackParams = false; // True if this call site can use push instructions - bool UsePush; + bool UsePush = false; }; typedef SmallVector<CallContext, 8> ContextVector; @@ -102,7 +114,7 @@ private: StringRef getPassName() const override { return "X86 Optimize Call Frame"; } - const TargetInstrInfo *TII; + const X86InstrInfo *TII; const X86FrameLowering *TFL; const X86Subtarget *STI; MachineRegisterInfo *MRI; @@ -112,11 +124,8 @@ 
private: }; char X86CallFrameOptimization::ID = 0; -} // end anonymous namespace -FunctionPass *llvm::createX86CallFrameOptimization() { - return new X86CallFrameOptimization(); -} +} // end anonymous namespace // This checks whether the transformation is legal. // Also returns false in cases where it's potentially legal, but @@ -322,7 +331,6 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // transformation. const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo()); - unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); @@ -331,8 +339,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. - unsigned int MaxAdjust = - FrameSetup->getOperand(0).getImm() >> Log2SlotSize; + unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize; // A zero adjustment means no stack parameters if (!MaxAdjust) { @@ -425,7 +432,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, return; Context.Call = &*I; - if ((++I)->getOpcode() != FrameDestroyOpcode) + if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode()) return; // Now, go through the vector, and see that we don't have any gaps, @@ -455,7 +462,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // PEI will end up finalizing the handling of this. MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; MachineBasicBlock &MBB = *(FrameSetup->getParent()); - FrameSetup->getOperand(1).setImm(Context.ExpectedDist); + TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist); DebugLoc DL = FrameSetup->getDebugLoc(); bool Is64Bit = STI->is64Bit(); @@ -482,11 +489,10 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8; } - Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) - .addOperand(PushOp); + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp); break; case X86::MOV32mr: - case X86::MOV64mr: + case X86::MOV64mr: { unsigned int Reg = PushOp.getReg(); // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg @@ -496,9 +502,9 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, Reg = MRI->createVirtualRegister(&X86::GR64RegClass); BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg); BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg) - .addReg(UndefReg) - .addOperand(PushOp) - .addImm(X86::sub_32bit); + .addReg(UndefReg) + .add(PushOp) + .addImm(X86::sub_32bit); } // If PUSHrmm is not slow on this target, try to fold the source of the @@ -525,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, } break; } + } // For debugging, when using SP-based CFA, we need to adjust the CFA // offset after each push. 
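[Editorial aside, not part of the patch] Two details of the call-frame hunks above are easy to restate in plain arithmetic. First, the candidate-store bound: with the new accessor, MaxAdjust = getFrameSize() >> Log2SlotSize, so a 32-byte adjustment with 4-byte slots caps the scan at 32 >> 2 = 8 stores. Second, the immediate-push selection reduces to a signed-8-bit range check; a hedged standalone restatement (the returned strings are descriptive labels, not the real X86 opcode enum):

```cpp
#include <cstdint>
#include <cstdio>

// An immediate that fits in a signed 8-bit field can use the short
// push-imm8 encoding; otherwise a 32-bit immediate push is needed.
static bool fitsInSigned8(int64_t v) { return v >= -128 && v <= 127; }

static const char *pickPushImmediateForm(int64_t imm, bool is64Bit) {
  if (fitsInSigned8(imm))
    return is64Bit ? "push imm8 (64-bit)" : "push imm8 (32-bit)";
  return is64Bit ? "push imm32 (64-bit)" : "push imm32 (32-bit)";
}

int main() {
  std::printf("%s\n", pickPushImmediateForm(1, false));    // short form
  std::printf("%s\n", pickPushImmediateForm(1000, false)); // needs imm32
  return 0;
}
```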
@@ -584,3 +591,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( return &DefMI; } + +FunctionPass *llvm::createX86CallFrameOptimization() { + return new X86CallFrameOptimization(); +} diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index 5ae4962378d34..137ef166aaeb0 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -14,12 +14,20 @@ //===----------------------------------------------------------------------===// #include "X86CallLowering.h" +#include "X86CallingConv.h" #include "X86ISelLowering.h" #include "X86InstrInfo.h" +#include "X86TargetMachine.h" + #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; +#include "X86GenCallingConv.inc" + #ifndef LLVM_BUILD_GLOBAL_ISEL #error "This shouldn't be built without GISel" #endif @@ -27,20 +35,183 @@ using namespace llvm; X86CallLowering::X86CallLowering(const X86TargetLowering &TLI) : CallLowering(&TLI) {} +void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, + MachineRegisterInfo &MRI, + SplitArgTy PerformArgSplit) const { + + const X86TargetLowering &TLI = *getTLI<X86TargetLowering>(); + LLVMContext &Context = OrigArg.Ty->getContext(); + EVT VT = TLI.getValueType(DL, OrigArg.Ty); + unsigned NumParts = TLI.getNumRegisters(Context, VT); + + if (NumParts == 1) { + // replace the original type ( pointer -> GPR ). + SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context), + OrigArg.Flags, OrigArg.IsFixed); + return; + } + + SmallVector<uint64_t, 4> BitOffsets; + SmallVector<unsigned, 8> SplitRegs; + + EVT PartVT = TLI.getRegisterType(Context, VT); + Type *PartTy = PartVT.getTypeForEVT(Context); + + for (unsigned i = 0; i < NumParts; ++i) { + ArgInfo Info = + ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)), + PartTy, OrigArg.Flags}; + SplitArgs.push_back(Info); + PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i); + } +} + +namespace { +struct FuncReturnHandler : public CallLowering::ValueHandler { + FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder &MIB, CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + unsigned getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + llvm_unreachable("Don't know how to get a stack address yet"); + } + + void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + CCValAssign &VA) override { + MIB.addUse(PhysReg, RegState::Implicit); + unsigned ExtReg = extendRegister(ValVReg, VA); + MIRBuilder.buildCopy(PhysReg, ExtReg); + } + + void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + llvm_unreachable("Don't know how to assign a value to an address yet"); + } + + MachineInstrBuilder &MIB; +}; +} // End anonymous namespace. + bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg) const { - // TODO: handle functions returning non-void values. 
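[Editorial aside, not part of the patch] The splitToValueTypes helper introduced above follows a callback pattern: the helper decides how many register-sized parts a wide value needs, and a caller-supplied lambda decides how each part is produced (extract, for returns) or consumed (insert, for formal arguments). A reduced sketch of that shape with no LLVM types; the part-size arithmetic and callback signature are illustrative assumptions:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>

// Split a WidthInBits-wide value into NumParts equal pieces and hand each
// piece's bit offset to a caller-supplied action, mirroring the shape of
// splitToValueTypes(..., PerformArgSplit).
using SplitAction = std::function<void(unsigned PartIdx, uint64_t BitOffset)>;

static void splitIntoParts(uint64_t WidthInBits, unsigned NumParts,
                           const SplitAction &PerformSplit) {
  const uint64_t PartBits = WidthInBits / NumParts;
  for (unsigned I = 0; I != NumParts; ++I)
    PerformSplit(I, PartBits * I);
}

int main() {
  // A 128-bit argument split into two 64-bit parts reports offsets 0 and 64,
  // which is what the extract/insert lambdas in lowerReturn and
  // lowerFormalArguments consume.
  splitIntoParts(128, 2, [](unsigned Idx, uint64_t Off) {
    std::printf("part %u at bit offset %llu\n", Idx,
                static_cast<unsigned long long>(Off));
  });
  return 0;
}
```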
- if (Val) - return false; - MIRBuilder.buildInstr(X86::RET).addImm(0); + assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); + + auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0); + + if (VReg) { + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = MF.getDataLayout(); + const Function &F = *MF.getFunction(); + + ArgInfo OrigArg{VReg, Val->getType()}; + setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); + + SmallVector<ArgInfo, 8> SplitArgs; + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, VReg, Offset); + }); + FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + } + + MIRBuilder.insertInstr(MIB); return true; } +namespace { +struct FormalArgHandler : public CallLowering::ValueHandler { + FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn, const DataLayout &DL) + : ValueHandler(MIRBuilder, MRI, AssignFn), DL(DL) {} + + unsigned getStackAddress(uint64_t Size, int64_t Offset, + MachinePointerInfo &MPO) override { + + auto &MFI = MIRBuilder.getMF().getFrameInfo(); + int FI = MFI.CreateFixedObject(Size, Offset, true); + MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); + + unsigned AddrReg = MRI.createGenericVirtualRegister( + LLT::pointer(0, DL.getPointerSizeInBits(0))); + MIRBuilder.buildFrameIndex(AddrReg, FI); + return AddrReg; + } + + void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, + MachinePointerInfo &MPO, CCValAssign &VA) override { + + auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, + 0); + MIRBuilder.buildLoad(ValVReg, Addr, *MMO); + } + + void assignValueToReg(unsigned ValVReg, unsigned PhysReg, + CCValAssign &VA) override { + MIRBuilder.getMBB().addLiveIn(PhysReg); + MIRBuilder.buildCopy(ValVReg, PhysReg); + } + + const DataLayout &DL; +}; +} // namespace + bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { - // TODO: handle functions with one or more arguments. - return F.arg_empty(); + if (F.arg_empty()) + return true; + + // TODO: handle variadic function + if (F.isVarArg()) + return false; + + MachineFunction &MF = MIRBuilder.getMF(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto DL = MF.getDataLayout(); + + SmallVector<ArgInfo, 8> SplitArgs; + unsigned Idx = 0; + for (auto &Arg : F.args()) { + ArgInfo OrigArg(VRegs[Idx], Arg.getType()); + setArgFlags(OrigArg, Idx + 1, DL, F); + LLT Ty = MRI.getType(VRegs[Idx]); + unsigned Dst = VRegs[Idx]; + bool Split = false; + splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + [&](unsigned Reg, uint64_t Offset) { + if (!Split) { + Split = true; + Dst = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Dst); + } + unsigned Tmp = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset); + Dst = Tmp; + }); + if (Dst != VRegs[Idx]) + MIRBuilder.buildCopy(VRegs[Idx], Dst); + Idx++; + } + + MachineBasicBlock &MBB = MIRBuilder.getMBB(); + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + + FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + + // Move back to the end of the basic block. 
+ MIRBuilder.setMBB(MBB); + + return true; } diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index f2672f09d8558..204e6974c702e 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,6 +34,14 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; +private: + /// A function of this type is used to perform value split action. + typedef std::function<void(unsigned, uint64_t)> SplitArgTy; + + void splitToValueTypes(const ArgInfo &OrigArgInfo, + SmallVectorImpl<ArgInfo> &SplitArgs, + const DataLayout &DL, MachineRegisterInfo &MRI, + SplitArgTy SplitArg) const; }; } // End of namespace llvm; #endif diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index cf7bc981b8a58..6781d761a1c4f 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -1074,6 +1074,8 @@ def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs, (sequence "K%u", 0, 7))>; def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>; +def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9, + R10, R11, R12, R13, R14, R15, RBP)>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, (sequence "YMM%u", 0, 15)), (sequence "XMM%u", 0, 15))>; diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index bdd1ab537bb2e..6472bbbc90169 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -20,16 +20,30 @@ //===---------------------------------------------------------------------===// #include "InstPrinter/X86InstComments.h" +#include "MCTargetDesc/X86BaseInfo.h" #include "X86.h" -#include "X86InstrBuilder.h" #include "X86InstrInfo.h" -#include "X86InstrTablesInfo.h" -#include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "X86TargetMachine.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include <cassert> +#include <cstdint> using namespace llvm; +// Including the generated EVEX2VEX tables. +struct X86EvexToVexCompressTableEntry { + uint16_t EvexOpcode; + uint16_t VexOpcode; +}; +#include "X86GenEVEX2VEXTables.inc" + #define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible" #define EVEX2VEX_NAME "x86-evex-to-vex-compress" @@ -56,8 +70,6 @@ class EvexToVexInstPass : public MachineFunctionPass { public: static char ID; - StringRef getPassName() const override { return EVEX2VEX_DESC; } - EvexToVexInstPass() : MachineFunctionPass(ID) { initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry()); @@ -72,6 +84,8 @@ public: } } + StringRef getPassName() const override { return EVEX2VEX_DESC; } + /// Loop over all of the basic blocks, replacing EVEX instructions /// by equivalent VEX instructions when possible for reducing code size. 
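[Editorial aside, not part of the patch] The X86GenEVEX2VEXTables.inc file included above is just a list of (EvexOpcode, VexOpcode) pairs that the pass loads into a map and consults per instruction. A self-contained sketch of that lookup shape; the opcode values below are invented for illustration:

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>

struct EvexToVexEntry {
  uint16_t EvexOpcode;
  uint16_t VexOpcode;
};

// Stand-in for the TableGen-emitted table contents.
static const EvexToVexEntry CompressTable[] = {
    {0x1234, 0x0034},
    {0x1235, 0x0035},
};

// Returns 0 when there is no VEX equivalent and the EVEX form must be kept.
static uint16_t lookupVexOpcode(uint16_t EvexOpcode) {
  static const std::unordered_map<uint16_t, uint16_t> Map = [] {
    std::unordered_map<uint16_t, uint16_t> M;
    for (const EvexToVexEntry &E : CompressTable)
      M.emplace(E.EvexOpcode, E.VexOpcode);
    return M;
  }();
  auto It = Map.find(EvexOpcode);
  return It == Map.end() ? 0 : It->second;
}

int main() {
  std::printf("0x1234 -> 0x%04x\n", lookupVexOpcode(0x1234)); // compressible
  std::printf("0x9999 -> 0x%04x\n", lookupVexOpcode(0x9999)); // keep EVEX
  return 0;
}
```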
bool runOnMachineFunction(MachineFunction &MF) override; @@ -88,13 +102,8 @@ private: }; char EvexToVexInstPass::ID = 0; -} -INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) - -FunctionPass *llvm::createX86EvexToVexInsts() { - return new EvexToVexInstPass(); -} +} // end anonymous namespace bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); @@ -125,7 +134,6 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable, // For EVEX instructions that can be encoded using VEX encoding // replace them by the VEX encoding in order to reduce size. bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { - // VEX format. // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] @@ -211,3 +219,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { MI.setAsmPrinterFlag(AC_EVEX_2_VEX); return true; } + +INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) + +FunctionPass *llvm::createX86EvexToVexInsts() { + return new EvexToVexInstPass(); +} diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 985acf92a2d41..5dfd95f713015 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -77,9 +77,11 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, default: return false; case X86::TCRETURNdi: + case X86::TCRETURNdicc: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: + case X86::TCRETURNdi64cc: case X86::TCRETURNri64: case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; @@ -97,6 +99,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, Offset = StackAdj - MaxTCDelta; assert(Offset >= 0 && "Offset should never be negative"); + if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) { + assert(Offset == 0 && "Conditional tail call cannot adjust the stack."); + } + if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); @@ -105,12 +111,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Jump to label or value in register. bool IsWin64 = STI->isTargetWin64(); - if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) { + if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc || + Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) { unsigned Op; switch (Opcode) { case X86::TCRETURNdi: Op = X86::TAILJMPd; break; + case X86::TCRETURNdicc: + Op = X86::TAILJMPd_CC; + break; + case X86::TCRETURNdi64cc: + assert(!MBB.getParent()->hasWinCFI() && + "Conditional tail calls confuse " + "the Win64 unwinder."); + Op = X86::TAILJMPd64_CC; + break; default: // Note: Win64 uses REX prefixes indirect jumps out of functions, but // not direct ones. @@ -126,13 +142,17 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } + if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) { + MIB.addImm(MBBI->getOperand(2).getImm()); + } + } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { unsigned Op = (Opcode == X86::TCRETURNmi) ? X86::TAILJMPm : (IsWin64 ? 
X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != 5; ++i) - MIB.addOperand(MBBI->getOperand(i)); + MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { BuildMI(MBB, MBBI, DL, TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) @@ -195,7 +215,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL)); } for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I) - MIB.addOperand(MBBI->getOperand(I)); + MIB.add(MBBI->getOperand(I)); MBB.erase(MBBI); return true; } diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index c890fdd1e5198..036f5d2610e45 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -367,6 +367,10 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: + // TODO: Support this properly. + if (Subtarget->hasAVX512()) + return false; + LLVM_FALLTHROUGH; case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; @@ -524,6 +528,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { + bool HasSSE1 = Subtarget->hasSSE1(); bool HasSSE2 = Subtarget->hasSSE2(); bool HasSSE4A = Subtarget->hasSSE4A(); bool HasAVX = Subtarget->hasAVX(); @@ -537,6 +542,16 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::f80: // No f80 support yet. default: return false; case MVT::i1: { + // In case ValReg is a K register, COPY to a GPR + if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) { + unsigned KValReg = ValReg; + ValReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ValReg) + .addReg(KValReg); + ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true, + X86::sub_8bit); + } // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -574,6 +589,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, } else Opc = X86::ST_Fp64m; break; + case MVT::x86mmx: + Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr; + break; case MVT::v4f32: if (Aligned) { if (IsNonTemporal) @@ -1268,6 +1286,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; + // In case SrcReg is a K register, COPY to a GPR + if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) { + unsigned KSrcReg = SrcReg; + SrcReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), SrcReg) + .addReg(KSrcReg); + SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + } SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } @@ -1559,6 +1587,17 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. 
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT == MVT::i1) { + // In case ResultReg is a K register, COPY to a GPR + if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) { + unsigned KResultReg = ResultReg; + ResultReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(KResultReg); + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + X86::sub_8bit); + } + // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; @@ -1740,10 +1779,12 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) { unsigned KOpReg = OpReg; - OpReg = createResultReg(&X86::GR8RegClass); + OpReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), OpReg) .addReg(KOpReg); + OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true, + X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg) @@ -2084,10 +2125,12 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { unsigned KCondReg = CondReg; - CondReg = createResultReg(&X86::GR8RegClass); + CondReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); + CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true, + X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)) @@ -2297,10 +2340,12 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { // In case OpReg is a K register, COPY to a GPR if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) { unsigned KCondReg = CondReg; - CondReg = createResultReg(&X86::GR8RegClass); + CondReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CondReg) .addReg(KCondReg, getKillRegState(CondIsKill)); + CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true, + X86::sub_8bit); } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)) @@ -2423,12 +2468,22 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, if (OpReg == 0) return false; + unsigned ImplicitDefReg; + if (Subtarget->hasAVX()) { + ImplicitDefReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); + + } + unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), ResultReg); + if (Subtarget->hasAVX()) - MIB.addReg(OpReg); + MIB.addReg(ImplicitDefReg); + MIB.addReg(OpReg); updateValueMap(I, ResultReg); return true; @@ -2461,7 +2516,8 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { EVT DstVT = TLI.getValueType(DL, I->getType()); // This code only handles truncation to byte. - if (DstVT != MVT::i8 && DstVT != MVT::i1) + // TODO: Support truncate to i1 with AVX512. 
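[Editorial aside, not part of the patch] The "In case ... is a K register, COPY to a GPR" fixups repeated throughout these FastISel hunks all follow the same two-step shape: widen the 1-bit mask value into a 32-bit GPR, then keep only its low byte (the sub_8bit view) before testing bit 0. Restated abstractly in plain C++, with the function name invented for illustration:

```cpp
#include <cstdint>
#include <cstdio>

// A 1-bit value held in an AVX-512 mask (k) register is copied into a 32-bit
// GPR, viewed through its low byte (sub_8bit), and only bit 0 is trusted.
static bool readI1FromWidenedMaskCopy(uint32_t copiedFromKReg) {
  const uint8_t lowByte = static_cast<uint8_t>(copiedFromKReg); // sub_8bit
  return (lowByte & 1) != 0;                                    // test bit 0
}

int main() {
  std::printf("%d %d\n", readI1FromWidenedMaskCopy(0x1),
              readI1FromWidenedMaskCopy(0xFE)); // prints 1 0
  return 0;
}
```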
+ if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512())) return false; if (!TLI.isTypeLegal(SrcVT)) return false; @@ -3105,8 +3161,8 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, return 0; if (CS) - if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || - CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) || + CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU()) return 0; return 4; @@ -3266,6 +3322,16 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Handle zero-extension from i1 to i8, which is common. if (ArgVT == MVT::i1) { + // In case SrcReg is a K register, COPY to a GPR + if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) { + unsigned KArgReg = ArgReg; + ArgReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ArgReg) + .addReg(KArgReg); + ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true, + X86::sub_8bit); + } // Set the high bits to zero. ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false); ArgVT = MVT::i8; @@ -3463,6 +3529,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; + unsigned SrcReg = VA.getLocReg(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && @@ -3470,9 +3537,19 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { report_fatal_error("SSE register return with SSE disabled"); } + // If the return value is an i1 and AVX-512 is enabled, we need + // to do a fixup to make the copy legal. + if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) { + // Need to copy to a GR32 first. + // TODO: MOVZX isn't great here. We don't care about the upper bits. + SrcReg = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL); + } + // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. - if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; CopyReg = createResultReg(&X86::RFP80RegClass); @@ -3480,7 +3557,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Copy out the result. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg); InRegs.push_back(VA.getLocReg()); // Round the f80 to the right size, which also moves it to the appropriate @@ -3601,6 +3678,13 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: + if (Subtarget->hasAVX512()) { + // Need to copy to a VK1 register. 
+ unsigned ResultReg = createResultReg(&X86::VK1RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg); + return ResultReg; + } case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); @@ -3622,7 +3706,12 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { unsigned Opc = 0; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); - case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH; + case MVT::i1: + // TODO: Support this properly. + if (Subtarget->hasAVX512()) + return 0; + VT = MVT::i8; + LLVM_FALLTHROUGH; case MVT::i8: Opc = X86::MOV8ri; break; case MVT::i16: Opc = X86::MOV16ri; break; case MVT::i32: Opc = X86::MOV32ri; break; diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index 8bde4bf98d668..c28746f96439b 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -95,10 +95,9 @@ class FixupBWInstPass : public MachineFunctionPass { // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if // possible. Return the replacement instruction if OK, return nullptr - // otherwise. Set WasCandidate to true or false depending on whether the - // MI was a candidate for this sort of transformation. - MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB, - bool &WasCandidate) const; + // otherwise. + MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const; + public: static char ID; @@ -226,7 +225,7 @@ MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode, unsigned NumArgs = MI->getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) - MIB.addOperand(MI->getOperand(i)); + MIB.add(MI->getOperand(i)); MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); @@ -264,17 +263,13 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { // Drop imp-defs/uses that would be redundant with the new def/use. for (auto &Op : MI->implicit_operands()) if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg)) - MIB.addOperand(Op); + MIB.add(Op); return MIB; } -MachineInstr *FixupBWInstPass::tryReplaceInstr( - MachineInstr *MI, MachineBasicBlock &MBB, - bool &WasCandidate) const { - MachineInstr *NewMI = nullptr; - WasCandidate = false; - +MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, + MachineBasicBlock &MBB) const { // See if this is an instruction of the type we are currently looking for. switch (MI->getOpcode()) { @@ -282,12 +277,9 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr( // Only replace 8 bit loads with the zero extending versions if // in an inner most loop and not optimizing for size. This takes // an extra byte to encode, and provides limited performance upside. - if (MachineLoop *ML = MLI->getLoopFor(&MBB)) { - if (ML->begin() == ML->end() && !OptForSize) { - NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI); - WasCandidate = true; - } - } + if (MachineLoop *ML = MLI->getLoopFor(&MBB)) + if (ML->begin() == ML->end() && !OptForSize) + return tryReplaceLoad(X86::MOVZX32rm8, MI); break; case X86::MOV16rm: @@ -295,9 +287,7 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr( // Code size is the same, and there is sometimes a perf advantage // from eliminating a false dependence on the upper portion of // the register. 
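[Editorial aside, not part of the patch] The tryReplaceInstr change in the X86FixupBWInsts hunks around here is the classic cleanup of replacing an out-parameter plus sentinel with a nullable return: "not a candidate" and "candidate but no replacement" collapse into the same nullptr result, so callers need one test instead of a separate WasCandidate flag. In isolation, and with placeholder types rather than the MachineInstr API:

```cpp
#include <cstdio>

struct Instr { int Opcode; };

// nullptr means "leave the instruction alone", whatever the reason.
static const Instr *tryReplace(const Instr &I, const Instr &Replacement) {
  return I.Opcode == 42 ? &Replacement : nullptr; // pretend 42 is replaceable
}

int main() {
  Instr Old{42}, New{43};
  if (const Instr *R = tryReplace(Old, New)) // single test, no flag
    std::printf("replace opcode %d with %d\n", Old.Opcode, R->Opcode);
  return 0;
}
```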
- NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI); - WasCandidate = true; - break; + return tryReplaceLoad(X86::MOVZX32rm16, MI); case X86::MOV8rr: case X86::MOV16rr: @@ -305,16 +295,14 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr( // Code size is either less (16) or equal (8), and there is sometimes a // perf advantage from eliminating a false dependence on the upper portion // of the register. - NewMI = tryReplaceCopy(MI); - WasCandidate = true; - break; + return tryReplaceCopy(MI); default: // nothing to do here. break; } - return NewMI; + return nullptr; } void FixupBWInstPass::processBasicBlock(MachineFunction &MF, @@ -338,18 +326,11 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, // We run after PEI, so we need to AddPristinesAndCSRs. LiveRegs.addLiveOuts(MBB); - bool WasCandidate = false; - for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; - MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate); - - // Add this to replacements if it was a candidate, even if NewMI is - // nullptr. We will revisit that in a bit. - if (WasCandidate) { + if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB)) MIReplacements.push_back(std::make_pair(MI, NewMI)); - } // We're done with this instruction, update liveness for the next one. LiveRegs.stepBackward(*MI); @@ -359,9 +340,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, MachineInstr *MI = MIReplacements.back().first; MachineInstr *NewMI = MIReplacements.back().second; MIReplacements.pop_back(); - if (NewMI) { - MBB.insert(MI, NewMI); - MBB.erase(MI); - } + MBB.insert(MI, NewMI); + MBB.erase(MI); } } diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 12095917ca30c..2cd4c1a3e7b36 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -120,8 +120,8 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, BuildMI(*MF, MI.getDebugLoc(), TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r)) - .addOperand(Dest) - .addOperand(Src) + .add(Dest) + .add(Src) .addImm(1) .addReg(0) .addImm(0) @@ -287,8 +287,8 @@ bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, MachineInstr *NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode)) - .addOperand(MI.getOperand(0)) - .addOperand(MI.getOperand(1)); + .add(MI.getOperand(0)) + .add(MI.getOperand(1)); MFI->erase(I); I = static_cast<MachineBasicBlock::iterator>(NewMI); return true; @@ -377,9 +377,9 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3); const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1); NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode)) - .addOperand(Dst) - .addOperand(Src1) - .addOperand(Src2); + .add(Dst) + .add(Src1) + .add(Src2); MFI->insert(I, NewMI); DEBUG(NewMI->dump();); } @@ -387,8 +387,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, if (MI.getOperand(4).getImm() != 0) { const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 
1 : 3); NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode)) - .addOperand(Dst) - .addOperand(SrcR) + .add(Dst) + .add(SrcR) .addImm(MI.getOperand(4).getImm()); MFI->insert(I, NewMI); DEBUG(NewMI->dump();); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index cd690442bb9f8..78e0bca4158ee 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, int64_t NumBytes, bool InEpilogue) const { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; + MachineInstr::MIFlag Flag = + isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy; uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); - while (Offset) { - if (Offset > Chunk) { - // Rather than emit a long series of instructions for large offsets, - // load the offset into a register and do one sub/add - unsigned Reg = 0; + if (Offset > Chunk) { + // Rather than emit a long series of instructions for large offsets, + // load the offset into a register and do one sub/add + unsigned Reg = 0; + unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); - if (isSub && !isEAXLiveIn(MBB)) - Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); + if (isSub && !isEAXLiveIn(MBB)) + Reg = Rax; + else + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + + unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; + unsigned AddSubRROpc = + isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); + if (Reg) { + BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg) + .addImm(Offset) + .setMIFlag(Flag); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr) + .addReg(StackPtr) + .addReg(Reg); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + return; + } else if (Offset > 8 * Chunk) { + // If we would need more than 8 add or sub instructions (a >16GB stack + // frame), it's worth spilling RAX to materialize this immediate. + // pushq %rax + // movabsq +-$Offset+-SlotSize, %rax + // addq %rsp, %rax + // xchg %rax, (%rsp) + // movq (%rsp), %rsp + assert(Is64Bit && "can't have 32-bit 16GB stack frame"); + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) + .addReg(Rax, RegState::Kill) + .setMIFlag(Flag); + // Subtract is not commutative, so negate the offset and always use add. + // Subtract 8 less and add 8 more to account for the PUSH we just did. + if (isSub) + Offset = -(Offset - SlotSize); else - Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); - - if (Reg) { - unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri; - BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) - .addImm(Offset); - Opc = isSub - ? getSUBrrOpcode(Is64Bit) - : getADDrrOpcode(Is64Bit); - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addReg(Reg); - MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. - Offset = 0; - continue; - } + Offset = Offset + SlotSize; + BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax) + .addImm(Offset) + .setMIFlag(Flag); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax) + .addReg(Rax) + .addReg(StackPtr); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + // Exchange the new SP in RAX with the top of the stack. + addRegOffset( + BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax), + StackPtr, false, 0); + // Load new SP from the top of the stack into RSP. 
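[Editorial aside, not part of the patch] The +-SlotSize bookkeeping in the very-large-offset path of emitSPUpdate above checks out with plain arithmetic: the PUSH of RAX moves RSP down by one slot before the ADD, and that slot cancels out so the final RSP lands exactly at old RSP minus (or plus) Offset. A standalone restatement with SlotSize fixed at 8 to mimic x86-64:

```cpp
#include <cstdint>
#include <cstdio>

// Frame setup (subtract Offset):  materialize -(Offset - SlotSize)
// Frame destroy (add Offset):     materialize  (Offset + SlotSize)
// because the ADD sees RSP already lowered by one pushed slot.
static int64_t materializedConstant(uint64_t Offset, bool IsSub,
                                    uint64_t SlotSize = 8) {
  return IsSub ? -static_cast<int64_t>(Offset - SlotSize)
               : static_cast<int64_t>(Offset + SlotSize);
}

int main() {
  const uint64_t Offset = 24ull << 30; // 24 GiB, above the 8 * (2^31 - 1) cutoff
  std::printf("setup constant:   %lld\n",
              static_cast<long long>(materializedConstant(Offset, true)));
  std::printf("destroy constant: %lld\n",
              static_cast<long long>(materializedConstant(Offset, false)));
  return 0;
}
```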
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr), + StackPtr, false, 0); + return; } + } + while (Offset) { uint64_t ThisVal = std::min(Offset, Chunk); - if (ThisVal == (Is64Bit ? 8 : 4)) { - // Use push / pop instead. + if (ThisVal == SlotSize) { + // Use push / pop for slot sized adjustments as a size optimization. We + // need to find a dead register when using pop. unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); @@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, unsigned Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) : (Is64Bit ? X86::POP64r : X86::POP32r); - MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) - .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); - if (isSub) - MI->setFlag(MachineInstr::FrameSetup); - else - MI->setFlag(MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DL, TII.get(Opc)) + .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)) + .setMIFlag(Flag); Offset -= ThisVal; continue; } } - MachineInstrBuilder MI = BuildStackAdjustment( - MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); - if (isSub) - MI.setMIFlag(MachineInstr::FrameSetup); - else - MI.setMIFlag(MachineInstr::FrameDestroy); + BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue) + .setMIFlag(Flag); Offset -= ThisVal; } @@ -959,6 +988,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .getValueAsString() .getAsInteger(0, StackProbeSize); + // Re-align the stack on 64-bit if the x86-interrupt calling convention is + // used and an error code was pushed, since the x86-64 ABI requires a 16-byte + // stack alignment. + if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit && + Fn->arg_size() == 2) { + StackSize += 8; + MFI.setStackSize(StackSize); + emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false); + } + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the @@ -2587,8 +2626,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; + uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0; I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index e1b04d6dc3003..863dc8b229688 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -20,6 +20,7 @@ namespace llvm { class MachineInstrBuilder; class MCCFIInstruction; +class X86InstrInfo; class X86Subtarget; class X86RegisterInfo; @@ -30,7 +31,7 @@ public: // Cached subtarget predicates. 
const X86Subtarget &STI; - const TargetInstrInfo &TII; + const X86InstrInfo &TII; const X86RegisterInfo *TRI; unsigned SlotSize; diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def new file mode 100644 index 0000000000000..06be142432f72 --- /dev/null +++ b/lib/Target/X86/X86GenRegisterBankInfo.def @@ -0,0 +1,104 @@ +//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines all the static objects used by X86RegisterBankInfo. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +#ifdef GET_TARGET_REGBANK_INFO_IMPL +RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{ + /* StartIdx, Length, RegBank */ + // GPR value + {0, 8, X86::GPRRegBank}, // :0 + {0, 16, X86::GPRRegBank}, // :1 + {0, 32, X86::GPRRegBank}, // :2 + {0, 64, X86::GPRRegBank}, // :3 + // FR32/64 , xmm registers + {0, 32, X86::VECRRegBank}, // :4 + {0, 64, X86::VECRRegBank}, // :5 + // VR128/256/512 + {0, 128, X86::VECRRegBank}, // :6 + {0, 256, X86::VECRRegBank}, // :7 + {0, 512, X86::VECRRegBank}, // :8 +}; +#endif // GET_TARGET_REGBANK_INFO_IMPL + +#ifdef GET_TARGET_REGBANK_INFO_CLASS +enum PartialMappingIdx { + PMI_None = -1, + PMI_GPR8, + PMI_GPR16, + PMI_GPR32, + PMI_GPR64, + PMI_FP32, + PMI_FP64, + PMI_VEC128, + PMI_VEC256, + PMI_VEC512 +}; +#endif // GET_TARGET_REGBANK_INFO_CLASS + +#ifdef GET_TARGET_REGBANK_INFO_IMPL +#define INSTR_3OP(INFO) INFO, INFO, INFO, +#define BREAKDOWN(INDEX, NUM) \ + { &X86GenRegisterBankInfo::PartMappings[INDEX], NUM } +// ValueMappings. +RegisterBankInfo::ValueMapping X86GenRegisterBankInfo::ValMappings[]{ + /* BreakDown, NumBreakDowns */ + // 3-operands instructions (all binary operations should end up with one of + // those mapping). + INSTR_3OP(BREAKDOWN(PMI_GPR8, 1)) // 0: GPR_8 + INSTR_3OP(BREAKDOWN(PMI_GPR16, 1)) // 3: GPR_16 + INSTR_3OP(BREAKDOWN(PMI_GPR32, 1)) // 6: GPR_32 + INSTR_3OP(BREAKDOWN(PMI_GPR64, 1)) // 9: GPR_64 + INSTR_3OP(BREAKDOWN(PMI_FP32, 1)) // 12: Fp32 + INSTR_3OP(BREAKDOWN(PMI_FP64, 1)) // 15: Fp64 + INSTR_3OP(BREAKDOWN(PMI_VEC128, 1)) // 18: Vec128 + INSTR_3OP(BREAKDOWN(PMI_VEC256, 1)) // 21: Vec256 + INSTR_3OP(BREAKDOWN(PMI_VEC512, 1)) // 24: Vec512 +}; +#undef INSTR_3OP +#undef BREAKDOWN +#endif // GET_TARGET_REGBANK_INFO_IMPL + +#ifdef GET_TARGET_REGBANK_INFO_CLASS +enum ValueMappingIdx { + VMI_None = -1, + VMI_3OpsGpr8Idx = PMI_GPR8 * 3, + VMI_3OpsGpr16Idx = PMI_GPR16 * 3, + VMI_3OpsGpr32Idx = PMI_GPR32 * 3, + VMI_3OpsGpr64Idx = PMI_GPR64 * 3, + VMI_3OpsFp32Idx = PMI_FP32 * 3, + VMI_3OpsFp64Idx = PMI_FP64 * 3, + VMI_3OpsVec128Idx = PMI_VEC128 * 3, + VMI_3OpsVec256Idx = PMI_VEC256 * 3, + VMI_3OpsVec512Idx = PMI_VEC512 * 3, +}; +#undef GET_TARGET_REGBANK_INFO_CLASS +#endif // GET_TARGET_REGBANK_INFO_CLASS + +#ifdef GET_TARGET_REGBANK_INFO_IMPL +#undef GET_TARGET_REGBANK_INFO_IMPL +const RegisterBankInfo::ValueMapping * +X86GenRegisterBankInfo::getValueMapping(PartialMappingIdx Idx, + unsigned NumOperands) { + + // We can use VMI_3Ops Mapping for all the cases. 
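[Editorial aside, not part of the patch] The ValueMappingIdx values in the new X86GenRegisterBankInfo.def are all PartialMappingIdx * 3 because INSTR_3OP emits each partial mapping three times, once per operand of a binary operation. A tiny check of that arithmetic, using a reduced enum that omits PMI_None and the FP/vector entries:

```cpp
#include <cassert>

// Each partial mapping appears three times in ValMappings (one per operand),
// so the value-mapping index is the partial-mapping index times three.
enum PartialMappingIdx { PMI_GPR8, PMI_GPR16, PMI_GPR32, PMI_GPR64 };

static unsigned valueMappingIndex(PartialMappingIdx Idx) {
  return static_cast<unsigned>(Idx) * 3;
}

int main() {
  assert(valueMappingIndex(PMI_GPR32) == 6); // matches the "// 6: GPR_32" row
  assert(valueMappingIndex(PMI_GPR64) == 9); // matches the "// 9: GPR_64" row
  return 0;
}
```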
+ if (NumOperands <= 3 && (Idx >= PMI_GPR8 && Idx <= PMI_VEC512)) + return &ValMappings[(unsigned)Idx * 3]; + + llvm_unreachable("Unsupported PartialMappingIdx."); +} + +#endif // GET_TARGET_REGBANK_INFO_IMPL + diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8ab4c0616880c..eb5c56ff2ff91 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -188,7 +188,6 @@ namespace { private: void Select(SDNode *N) override; - bool tryGather(SDNode *N, unsigned Opc); bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); @@ -384,6 +383,16 @@ namespace { bool ComplexPatternFuncMutatesDAG() const override { return true; } + + bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; + + /// Returns whether this is a relocatable immediate in the range + /// [-2^Width .. 2^Width-1]. + template <unsigned Width> bool isSExtRelocImm(SDNode *N) const { + if (auto *CN = dyn_cast<ConstantSDNode>(N)) + return isInt<Width>(CN->getSExtValue()); + return isSExtAbsoluteSymbolRef(Width, N); + } }; } @@ -709,7 +718,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && - Subtarget->isTargetGlibc()) + (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || + Subtarget->isTargetFuchsia())) switch (N->getPointerInfo().getAddrSpace()) { case 256: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); @@ -1325,8 +1335,8 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - insertDAGNode(*CurDAG, N, Zero); - insertDAGNode(*CurDAG, N, Neg); + insertDAGNode(*CurDAG, Handle.getValue(), Zero); + insertDAGNode(*CurDAG, Handle.getValue(), Neg); return false; } @@ -1789,6 +1799,21 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() { return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } +bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { + if (N->getOpcode() == ISD::TRUNCATE) + N = N->getOperand(0).getNode(); + if (N->getOpcode() != X86ISD::Wrapper) + return false; + + auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0)); + if (!GA) + return false; + + Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); + return CR && CR->getSignedMin().sge(-1ull << Width) && + CR->getSignedMax().slt(1ull << Width); +} + /// Test whether the given X86ISD::CMP node has any uses which require the SF /// or OF bits to be accurate. static bool hasNoSignedComparisonUses(SDNode *N) { @@ -1905,6 +1930,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { ChainCheck = true; + // Drop Load, but keep its chain. No cycle check necessary. + ChainOps.push_back(Load.getOperand(0)); continue; } @@ -1954,39 +1981,6 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } -/// Customized ISel for GATHER operations. 
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) { - // Operands of Gather: VSrc, Base, VIdx, VMask, Scale - SDValue Chain = Node->getOperand(0); - SDValue VSrc = Node->getOperand(2); - SDValue Base = Node->getOperand(3); - SDValue VIdx = Node->getOperand(4); - SDValue VMask = Node->getOperand(5); - ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6)); - if (!Scale) - return false; - - SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(), - MVT::Other); - - SDLoc DL(Node); - - // Memory Operands: Base, Scale, Index, Disp, Segment - SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); - SDValue Segment = CurDAG->getRegister(0, MVT::i32); - const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx, - Disp, Segment, VMask, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); - // Node has 2 outputs: VDst and MVT::Other. - // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other. - // We replace VDst of Node with VDst of ResNode, and Other of Node with Other - // of ResNode. - ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0)); - ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2)); - CurDAG->RemoveDeadNode(Node); - return true; -} - void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opc, MOpc; @@ -2024,55 +2018,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } break; } - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: break; - case Intrinsic::x86_avx2_gather_d_pd: - case Intrinsic::x86_avx2_gather_d_pd_256: - case Intrinsic::x86_avx2_gather_q_pd: - case Intrinsic::x86_avx2_gather_q_pd_256: - case Intrinsic::x86_avx2_gather_d_ps: - case Intrinsic::x86_avx2_gather_d_ps_256: - case Intrinsic::x86_avx2_gather_q_ps: - case Intrinsic::x86_avx2_gather_q_ps_256: - case Intrinsic::x86_avx2_gather_d_q: - case Intrinsic::x86_avx2_gather_d_q_256: - case Intrinsic::x86_avx2_gather_q_q: - case Intrinsic::x86_avx2_gather_q_q_256: - case Intrinsic::x86_avx2_gather_d_d: - case Intrinsic::x86_avx2_gather_d_d_256: - case Intrinsic::x86_avx2_gather_q_d: - case Intrinsic::x86_avx2_gather_q_d_256: { - if (!Subtarget->hasAVX2()) - break; - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); - case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break; - case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break; - case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break; - case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break; - case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break; - case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break; - case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break; - case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break; - case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break; - case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break; - case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break; - case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break; - case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break; - case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break; - case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; - case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; - } - 
if (tryGather(Node, Opc)) - return; - break; - } - } - break; - } case X86ISD::GlobalBaseReg: ReplaceNode(Node, getGlobalBaseReg()); return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 08fe2bad281e5..7ff483063ec23 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -53,6 +53,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetOptions.h" #include <algorithm> #include <bitset> @@ -70,6 +71,13 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); +static cl::opt<int> ExperimentalPrefLoopAlignment( + "x86-experimental-pref-loop-alignment", cl::init(4), + cl::desc("Sets the preferable loop alignment for experiments " + "(the last x86-experimental-pref-loop-alignment bits" + " of the loop header PC will be 0)."), + cl::Hidden); + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -427,7 +435,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } - // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + + // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; @@ -782,6 +791,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); @@ -888,6 +898,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { + setOperationAction(ISD::ABS, MVT::v16i8, Legal); + setOperationAction(ISD::ABS, MVT::v8i16, Legal); + setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); @@ -922,6 +935,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); @@ -1065,6 +1086,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { + setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); @@ -1126,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } @@ -1271,6 +1293,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } if (Subtarget.hasVLX()) { + setOperationAction(ISD::ABS, MVT::v4i64, Legal); + setOperationAction(ISD::ABS, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); @@ -1357,16 +1381,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v16i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i64, Legal); - setOperationAction(ISD::ADD, MVT::v8i1, Expand); - setOperationAction(ISD::ADD, MVT::v16i1, Expand); - setOperationAction(ISD::SUB, MVT::v8i1, Expand); - setOperationAction(ISD::SUB, MVT::v16i1, Expand); - setOperationAction(ISD::MUL, MVT::v8i1, Expand); - setOperationAction(ISD::MUL, MVT::v16i1, Expand); + setOperationAction(ISD::ADD, MVT::v8i1, Custom); + setOperationAction(ISD::ADD, MVT::v16i1, Custom); + setOperationAction(ISD::SUB, MVT::v8i1, Custom); + setOperationAction(ISD::SUB, MVT::v16i1, Custom); + setOperationAction(ISD::MUL, MVT::v8i1, Custom); + setOperationAction(ISD::MUL, MVT::v16i1, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); @@ -1441,7 +1466,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Legal); @@ -1460,12 +1485,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::ADD, MVT::v32i1, Expand); - setOperationAction(ISD::ADD, MVT::v64i1, Expand); - setOperationAction(ISD::SUB, MVT::v32i1, Expand); - setOperationAction(ISD::SUB, MVT::v64i1, Expand); - setOperationAction(ISD::MUL, MVT::v32i1, Expand); - setOperationAction(ISD::MUL, MVT::v64i1, Expand); + setOperationAction(ISD::ADD, MVT::v32i1, Custom); + setOperationAction(ISD::ADD, MVT::v64i1, Custom); + setOperationAction(ISD::SUB, MVT::v32i1, Custom); + setOperationAction(ISD::SUB, MVT::v64i1, Custom); + setOperationAction(ISD::MUL, MVT::v32i1, Custom); + setOperationAction(ISD::MUL, MVT::v64i1, Custom); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); @@ -1479,8 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, 
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); @@ -1546,6 +1571,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); @@ -1574,9 +1600,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v2i1, &X86::VK2RegClass); for (auto VT : { MVT::v2i1, MVT::v4i1 }) { - setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); @@ -1671,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); @@ -1696,6 +1723,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); + setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG); + setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); @@ -1712,7 +1741,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - setPrefLoopAlignment(4); // 2^4 bytes. + // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). + setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. @@ -1933,6 +1963,34 @@ bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } +void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const { + + // Only relabel X86-32 for C / Stdcall CCs. 
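// A minimal standalone sketch (not LLVM code) of the arithmetic behind the
// x86-experimental-pref-loop-alignment option introduced above: the option
// value is an exponent, so a value of N requests 2^N-byte alignment of the
// loop header. The address used below is hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  unsigned AlignExp = 4;                   // default option value
  uint64_t AlignBytes = 1ull << AlignExp;  // 2^4 = 16-byte aligned loop headers
  uint64_t HeaderPC = 0x401230;            // hypothetical loop-header address
  // The last AlignExp bits of an aligned header PC are zero.
  bool Aligned = (HeaderPC & (AlignBytes - 1)) == 0;
  std::printf("alignment = %llu bytes, header aligned = %d\n",
              (unsigned long long)AlignBytes, (int)Aligned);
  return 0;
}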
+ if (Subtarget.is64Bit()) + return; + if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) + return; + unsigned ParamRegs = 0; + if (auto *M = MF->getFunction()->getParent()) + ParamRegs = M->getNumberRegisterParameters(); + + // Mark the first N int arguments as having reg + for (unsigned Idx = 0; Idx < Args.size(); Idx++) { + Type *T = Args[Idx].Ty; + if (T->isPointerTy() || T->isIntegerTy()) + if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { + unsigned numRegs = 1; + if (MF->getDataLayout().getTypeAllocSize(T) > 4) + numRegs = 2; + if (ParamRegs < numRegs) + return; + ParamRegs -= numRegs; + Args[Idx].IsInReg = true; + } + } +} + const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, @@ -2001,21 +2059,37 @@ unsigned X86TargetLowering::getAddressSpace() const { return 256; } -Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { - // glibc has a special slot for the stack guard in tcbhead_t, use it instead - // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h) - if (!Subtarget.isTargetGlibc()) - return TargetLowering::getIRStackGuard(IRB); - - // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x14 on i386 - unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; - unsigned AddressSpace = getAddressSpace(); +static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { + return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || + (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); +} + +static Constant* SegmentOffset(IRBuilder<> &IRB, + unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } +Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + // glibc, bionic, and Fuchsia have a special slot for the stack guard in + // tcbhead_t; use it instead of the usual global variable (see + // sysdeps/{i386,x86_64}/nptl/tls.h) + if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + return SegmentOffset(IRB, 0x10, getAddressSpace()); + } else { + // %fs:0x28, unless we're using a Kernel code model, in which case + // it's %gs:0x28. gs:0x14 on i386. + unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } + } + + return TargetLowering::getIRStackGuard(IRB); +} + void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isOSMSVCRT()) { @@ -2027,13 +2101,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const { auto *SecurityCheckCookie = cast<Function>( M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(M.getContext()), - Type::getInt8PtrTy(M.getContext()), nullptr)); + Type::getInt8PtrTy(M.getContext()))); SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); return; } - // glibc has a special slot for the stack guard. - if (Subtarget.isTargetGlibc()) + // glibc, bionic, and Fuchsia have a special slot for the stack guard. 
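// A standalone model (not the LLVM implementation) of the in-register budgeting
// done by markLibCallAttributes above for 32-bit C / StdCall libcalls: with a
// module-level budget of N register parameters, integer/pointer arguments of up
// to 4 bytes cost one register and larger (up to 8-byte) ones cost two, and
// marking stops as soon as an argument no longer fits the budget.
#include <cstdio>
#include <vector>

struct Arg { unsigned SizeInBytes; bool IsIntOrPtr; bool InReg = false; };

static void markInRegArgs(std::vector<Arg> &Args, unsigned ParamRegs) {
  for (Arg &A : Args) {
    if (!A.IsIntOrPtr || A.SizeInBytes > 8)
      continue;
    unsigned NumRegs = A.SizeInBytes > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      return;                 // budget exhausted; later arguments stay on the stack
    ParamRegs -= NumRegs;
    A.InReg = true;
  }
}

int main() {
  std::vector<Arg> Args = {{4, true}, {8, true}, {4, true}};
  markInRegArgs(Args, /*ParamRegs=*/3);  // i32 uses 1, i64 uses 2, third misses out
  for (const Arg &A : Args)
    std::printf("size=%u inreg=%d\n", A.SizeInBytes, (int)A.InReg);
  return 0;
}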
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } @@ -2056,21 +2130,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); - if (!Subtarget.isTargetAndroid()) - return TargetLowering::getSafeStackPointerLocation(IRB); - // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h - unsigned AddressSpace, Offset; + if (Subtarget.isTargetAndroid()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + // %gs:0x24 on i386 + unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; + return SegmentOffset(IRB, Offset, getAddressSpace()); + } - // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: - // %gs:0x24 on i386 - Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; - AddressSpace = getAddressSpace(); - return ConstantExpr::getIntToPtr( - ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), - Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); + // Fuchsia is similar. + if (Subtarget.isTargetFuchsia()) { + // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + return SegmentOffset(IRB, 0x18, getAddressSpace()); + } + + return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -2179,6 +2255,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); + + // Add the register to the CalleeSaveDisableRegs list. + if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); + SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); @@ -2253,6 +2334,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); + + // Add the second register to the CalleeSaveDisableRegs list. + if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } @@ -2309,6 +2394,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); + + // Add the returned register to the CalleeSaveDisableRegs list. 
+ if (CallConv == CallingConv::X86_RegCall) + MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); @@ -2444,7 +2533,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, // Convert the i32 type into v32i1 type Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); - // Concantenate the two values together + // Concatenate the two values together return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } @@ -2488,8 +2577,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, + uint32_t *RegMask) const { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget.is64Bit(); @@ -2503,6 +2594,14 @@ SDValue X86TargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); + // In some calling conventions we need to remove the used registers + // from the register mask. + if (RegMask && CallConv == CallingConv::X86_RegCall) { + for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + } + // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { @@ -2669,6 +2768,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; + MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same @@ -2686,13 +2786,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { - const X86Subtarget& Subtarget = - static_cast<const X86Subtarget&>(DAG.getSubtarget()); // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + if (Subtarget.is64Bit() && Ins.size() == 2) { + // The stack pointer needs to be realigned for 64 bit handlers with error + // code, so the argument offset changes by 8 bytes. + Offset += 8; + } } // FIXME: For now, all byval parameter objects are marked mutable. This can be @@ -2707,30 +2810,71 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } - return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - } else { - int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable); - - // Set SExt or ZExt flag. 
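// A standalone illustration (register numbers are made up) of how the RegCall
// paths above clear a register and its sub-registers from a packed register
// mask: each register occupies one bit of a uint32_t array, addressed by
// Reg / 32 and Reg % 32.
#include <cstdint>
#include <cstdio>

static void clearRegFromMask(uint32_t *RegMask, unsigned Reg) {
  RegMask[Reg / 32] &= ~(1u << (Reg % 32));
}

int main() {
  uint32_t RegMask[4] = {~0u, ~0u, ~0u, ~0u};  // 128 registers, all preserved
  clearRegFromMask(RegMask, 5);    // e.g. an argument/return register
  clearRegFromMask(RegMask, 37);   // e.g. one of its sub-registers
  std::printf("word0=0x%08x word1=0x%08x\n", RegMask[0], RegMask[1]);
  return 0;  // bits 5 and 37 are now zero, i.e. not preserved across the call
}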
- if (VA.getLocInfo() == CCValAssign::ZExt) { - MFI.setObjectZExt(FI, true); - } else if (VA.getLocInfo() == CCValAssign::SExt) { - MFI.setObjectSExt(FI, true); + return DAG.getFrameIndex(FI, PtrVT); + } + + // This is an argument in memory. We might be able to perform copy elision. + if (Flags.isCopyElisionCandidate()) { + EVT ArgVT = Ins[i].ArgVT; + SDValue PartAddr; + if (Ins[i].PartOffset == 0) { + // If this is a one-part value or the first part of a multi-part value, + // create a stack object for the entire argument value type and return a + // load from our portion of it. This assumes that if the first part of an + // argument is in memory, the rest will also be in memory. + int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), + /*Immutable=*/false); + PartAddr = DAG.getFrameIndex(FI, PtrVT); + return DAG.getLoad( + ValVT, dl, Chain, PartAddr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + } else { + // This is not the first piece of an argument in memory. See if there is + // already a fixed stack object including this offset. If so, assume it + // was created by the PartOffset == 0 branch above and create a load from + // the appropriate offset into it. + int64_t PartBegin = VA.getLocMemOffset(); + int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; + int FI = MFI.getObjectIndexBegin(); + for (; MFI.isFixedObjectIndex(FI); ++FI) { + int64_t ObjBegin = MFI.getObjectOffset(FI); + int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); + if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) + break; + } + if (MFI.isFixedObjectIndex(FI)) { + SDValue Addr = + DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), + DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); + return DAG.getLoad( + ValVT, dl, Chain, Addr, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, + Ins[i].PartOffset)); + } } + } - // Adjust SP offset of interrupt parameter. - if (CallConv == CallingConv::X86_INTR) { - MFI.setObjectOffset(FI, Offset); - } + int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, + VA.getLocMemOffset(), isImmutable); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad( - ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); - return ExtendedInMem ? - DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; + // Set SExt or ZExt flag. + if (VA.getLocInfo() == CCValAssign::ZExt) { + MFI.setObjectZExt(FI, true); + } else if (VA.getLocInfo() == CCValAssign::SExt) { + MFI.setObjectSExt(FI, true); + } + + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI.setObjectOffset(FI, Offset); } + + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) + : Val; } // FIXME: Get this from tablegen. 
@@ -2781,12 +2925,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } +#ifndef NDEBUG static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } +#endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -2836,8 +2982,8 @@ SDValue X86TargetLowering::LowerFormalArguments( // The next loop assumes that the locations are in the same order of the // input arguments. - if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; @@ -2853,7 +2999,7 @@ SDValue X86TargetLowering::LowerFormalArguments( "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are - // compiled to 32 bit arch, are splited up into two registers. + // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { @@ -3107,8 +3253,9 @@ SDValue X86TargetLowering::LowerFormalArguments( MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { - // X86 interrupts must pop the error code if present - FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); + // X86 interrupts must pop the error code (and the alignment padding) if + // present. + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. @@ -3146,6 +3293,12 @@ SDValue X86TargetLowering::LowerFormalArguments( } } + if (CallConv == CallingConv::X86_RegCall) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end())) + MF.getRegInfo().disableCalleeSavedRegister(Pair.first); + } + return Chain; } @@ -3348,8 +3501,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The next loop assumes that the locations are in the same order of the // input arguments. - if (!isSortedByValueNo(ArgLocs)) - llvm_unreachable("Argument Location list must be sorted before lowering"); + assert(isSortedByValueNo(ArgLocs) && + "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. 
@@ -3517,7 +3670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && - "Expecting custome case only in regcall calling convention"); + "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; @@ -3662,7 +3815,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Mask = RegInfo->getNoPreservedMask(); } - Ops.push_back(DAG.getRegisterMask(Mask)); + // Define a new register mask from the existing mask. + uint32_t *RegMask = nullptr; + + // In some calling conventions we need to remove the used physical registers + // from the reg mask. + if (CallConv == CallingConv::X86_RegCall) { + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + + // Allocate a new Reg Mask and copy Mask. + RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); + unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; + memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); + + // Make sure all sub registers of the argument registers are reset + // in the RegMask. + for (auto const &RegPair : RegsToPass) + for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); + SubRegs.isValid(); ++SubRegs) + RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); + + // Create the RegMask Operand according to our updated mask. + Ops.push_back(DAG.getRegisterMask(RegMask)); + } else { + // Create the RegMask Operand according to the static mask. + Ops.push_back(DAG.getRegisterMask(Mask)); + } if (InFlag.getNode()) Ops.push_back(InFlag); @@ -3715,8 +3893,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, - Ins, dl, DAG, InVals); + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals, RegMask); } //===----------------------------------------------------------------------===// @@ -4132,6 +4310,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { return true; // 'Faux' Target Shuffles. case ISD::AND: + case X86ISD::ANDNP: return true; } } @@ -4448,6 +4627,11 @@ bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } +bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( + const Instruction &AndI) const { + return true; +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -4460,6 +4644,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { return true; } +MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { + MVT VT = MVT::getIntegerVT(NumBits); + if (isTypeLegal(VT)) + return VT; + + // PMOVMSKB can handle this. + if (NumBits == 128 && isTypeLegal(MVT::v16i8)) + return MVT::v16i8; + + // VPMOVMSKB can handle this. + if (NumBits == 256 && isTypeLegal(MVT::v32i8)) + return MVT::v32i8; + + // TODO: Allow 64-bit type for 32-bit target. + // TODO: 512-bit types should be allowed, but make sure that those + // cases are handled in combineVectorSizedSetCCEquality(). + + return MVT::INVALID_SIMPLE_VALUE_TYPE; +} + /// Val is the undef sentinel value or equal to the specified value. 
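// hasFastEqualityCompare above reports that 128-bit (and, with AVX2, 256-bit)
// equality can be answered with a byte compare plus PMOVMSKB. A standalone
// SSE2 sketch of that idiom, written outside LLVM (requires an x86 host):
#include <cstdio>
#include <cstring>
#include <immintrin.h>

static bool equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);     // 0xFF in each matching byte lane
  return _mm_movemask_epi8(Eq) == 0xFFFF;  // all 16 bytes matched
}

int main() {
  char X[16], Y[16];
  std::memset(X, 0xAB, sizeof(X));
  std::memcpy(Y, X, sizeof(Y));
  std::printf("equal: %d\n", (int)equal16(X, Y));
  Y[7] ^= 1;
  std::printf("equal after flip: %d\n", (int)equal16(X, Y));
  return 0;
}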
static bool isUndefOrEqual(int Val, int CmpVal) { return ((Val == SM_SentinelUndef) || (Val == CmpVal)); @@ -4555,28 +4759,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, SmallVectorImpl<int> &WidenedMask) { WidenedMask.assign(Mask.size() / 2, 0); for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + int M0 = Mask[i]; + int M1 = Mask[i + 1]; + // If both elements are undef, its trivial. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { WidenedMask[i / 2] = SM_SentinelUndef; continue; } // Check for an undef mask and a mask value properly aligned to fit with // a pair of values. If we find such a case, use the non-undef mask's value. - if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && - Mask[i + 1] % 2 == 1) { - WidenedMask[i / 2] = Mask[i + 1] / 2; + if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { + WidenedMask[i / 2] = M1 / 2; continue; } - if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { + WidenedMask[i / 2] = M0 / 2; continue; } // When zeroing, we need to spread the zeroing across both lanes to widen. - if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { - if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && - (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { + if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && + (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } @@ -4585,9 +4791,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, // Finally check if the two mask values are adjacent and aligned with // a pair. - if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && - Mask[i] + 1 == Mask[i + 1]) { - WidenedMask[i / 2] = Mask[i] / 2; + if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { + WidenedMask[i / 2] = M0 / 2; continue; } @@ -4770,9 +4975,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, return ConstsNode; } -static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs, +static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { - assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays"); + assert(Bits.size() == Undefs.getBitWidth() && + "Unequal constant and undef arrays"); SmallVector<SDValue, 32> Ops; bool Split = false; @@ -4844,10 +5050,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); - // Extract from UNDEF is UNDEF. - if (Vec.isUndef()) - return DAG.getUNDEF(ResultVT); - // Extract the relevant vectorWidth bits. 
Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); @@ -4918,50 +5120,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); - - // For insertion into the zero index (low half) of a 256-bit vector, it is - // more efficient to generate a blend with immediate instead of an insert*128. - // We are still creating an INSERT_SUBVECTOR below with an undef node to - // extend the subvector to the size of the result vector. Make sure that - // we are not recursing on that node by checking for undef here. - if (IdxVal == 0 && Result.getValueType().is256BitVector() && - !Result.isUndef()) { - EVT ResultVT = Result.getValueType(); - SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResultVT); - SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, - Vec, ZeroIndex); - - // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); - if (ScalarType.isFloatingPoint()) { - // Choose either vblendps (float) or vblendpd (double). - unsigned ScalarSize = ScalarType.getSizeInBits(); - assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); - unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; - SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); - return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); - } - - const X86Subtarget &Subtarget = - static_cast<const X86Subtarget &>(DAG.getSubtarget()); - - // AVX2 is needed for 256-bit integer blend support. - // Integers must be cast to 32-bit because there is only vpblendd; - // vpblendw can't be used for this because it has a handicapped mask. - - // If we don't have AVX2, then cast to float. Using a wrong domain blend - // is still more efficient than using the wrong domain vinsertf128 that - // will be created by InsertSubVector(). - MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; - - SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); - Result = DAG.getBitcast(CastVT, Result); - Vec256 = DAG.getBitcast(CastVT, Vec256); - Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); - return DAG.getBitcast(ResultVT, Vec256); - } - return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } @@ -5023,7 +5181,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { if (IdxVal != 0) { SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + ShiftBits); } return ExtractSubVec(WideSubVec); } @@ -5032,9 +5191,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Vec = ShiftRight ? 
DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; return ExtractSubVec(Vec); } @@ -5043,8 +5202,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), @@ -5056,12 +5215,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); return ExtractSubVec(Vec); } @@ -5094,26 +5253,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT, } /// Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with -/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget, - SelectionDAG &DAG, const SDLoc &dl) { +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; - SDValue Vec; - if (!Subtarget.hasInt256() && NumElts == 8) { - Vec = DAG.getConstant(Ones, dl, MVT::v4i32); - Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); - } else { - Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); - } + SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } +static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, + SelectionDAG &DAG) { + EVT InVT = In.getValueType(); + assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); + + if (VT.is128BitVector() && InVT.is128BitVector()) + return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) + : DAG.getZeroExtendVectorInReg(In, DL, VT); + + // For 256-bit vectors, we only need the lower (128-bit) input half. + // For 512-bit vectors, we only need the lower input half or quarter. 
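// The canWidenShuffleElements changes earlier in this file pair adjacent mask
// elements so a shuffle can run at twice the element width. A standalone model
// of those pairing rules (the sentinel constants mirror SM_SentinelUndef and
// SM_SentinelZero but are otherwise illustrative):
#include <cstdio>
#include <vector>

static const int kUndef = -1;  // stands in for SM_SentinelUndef
static const int kZero = -2;   // stands in for SM_SentinelZero

static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == kUndef && M1 == kUndef) { Widened[i / 2] = kUndef; continue; }
    // One element undef, the other aligned to its half of a wide element.
    if (M0 == kUndef && M1 >= 0 && (M1 % 2) == 1) { Widened[i / 2] = M1 / 2; continue; }
    if (M1 == kUndef && M0 >= 0 && (M0 % 2) == 0) { Widened[i / 2] = M0 / 2; continue; }
    // Zeroing must cover both halves of the wide element.
    if (M0 == kZero || M1 == kZero) {
      if ((M0 == kZero || M0 == kUndef) && (M1 == kZero || M1 == kUndef)) {
        Widened[i / 2] = kZero;
        continue;
      }
      return false;
    }
    // Finally, an adjacent, aligned pair of real indices.
    if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { Widened[i / 2] = M0 / 2; continue; }
    return false;
  }
  return true;
}

int main() {
  std::vector<int> Mask = {2, 3, kUndef, kUndef, kZero, kUndef, 0, 1}, Wide;
  if (widenMask(Mask, Wide))
    for (int M : Wide) std::printf("%d ", M);  // prints: 1 -1 -2 0
  std::printf("\n");
  return 0;
}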
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { + int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); + In = extractSubVector(In, 0, DAG, DL, + std::max(128, (int)VT.getSizeInBits() / Scale)); + } + + return DAG.getNode(Opc, DL, VT, In); +} + /// Generate unpacklo/unpackhi shuffle mask. static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, bool Unary) { @@ -5199,9 +5370,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) { // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, - SmallBitVector &UndefElts, - SmallVectorImpl<APInt> &EltBits) { - assert(UndefElts.empty() && "Expected an empty UndefElts vector"); + APInt &UndefElts, + SmallVectorImpl<APInt> &EltBits, + bool AllowWholeUndefs = true, + bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); @@ -5211,56 +5383,83 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; + unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); // Split the undef/constant single bitset data into the target elements. auto SplitBitData = [&]() { - UndefElts = SmallBitVector(NumElts, false); + // Don't split if we don't allow undef bits. + bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; + if (UndefBits.getBoolValue() && !AllowUndefs) + return false; + + UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { - APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits); - UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits); + unsigned BitOffset = i * EltSizeInBits; + APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); - // Only treat an element as UNDEF if all bits are UNDEF, otherwise - // treat it as zero. + // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { - UndefElts[i] = true; + if (!AllowWholeUndefs) + return false; + UndefElts.setBit(i); continue; } - APInt Bits = MaskBits.lshr(i * EltSizeInBits); - Bits = Bits.zextOrTrunc(EltSizeInBits); + // If only some bits are UNDEF then treat them as zero (or bail if not + // supported). + if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) + return false; + + APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); EltBits[i] = Bits.getZExtValue(); } return true; }; - auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask, - APInt &Undefs) { + // Collect constant bits and insert into mask/undef bit masks. 
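// getTargetConstantBitsFromNode above packs every source element into one wide
// bit vector and then re-slices it at the requested element width using APInt
// insertBits/extractBits. A scaled-down illustration of that packing and
// re-slicing using a single uint64_t in place of the APInt bitsets:
#include <cstdint>
#include <cstdio>

// Insert Width bits of Val into Bits starting at BitOffset (little-endian,
// matching the x86 convention used by the lowering code).
static void insertBits(uint64_t &Bits, uint64_t Val, unsigned Width,
                       unsigned BitOffset) {
  uint64_t Mask = (Width == 64) ? ~0ull : ((1ull << Width) - 1);
  Bits = (Bits & ~(Mask << BitOffset)) | ((Val & Mask) << BitOffset);
}

static uint64_t extractBits(uint64_t Bits, unsigned Width, unsigned BitOffset) {
  uint64_t Mask = (Width == 64) ? ~0ull : ((1ull << Width) - 1);
  return (Bits >> BitOffset) & Mask;
}

int main() {
  // Pack four 16-bit "elements" ...
  uint64_t Packed = 0;
  uint16_t Src[4] = {0x1111, 0x2222, 0x3333, 0x4444};
  for (unsigned i = 0; i != 4; ++i)
    insertBits(Packed, Src[i], 16, i * 16);
  // ... then re-read the same bits as eight 8-bit elements.
  for (unsigned i = 0; i != 8; ++i)
    std::printf("elt[%u] = 0x%02llx\n", i,
                (unsigned long long)extractBits(Packed, 8, i * 8));
  return 0;
}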
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, + unsigned BitOffset) { if (!Cst) return false; unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits(); if (isa<UndefValue>(Cst)) { - Mask = APInt::getNullValue(SizeInBits); - Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits); + Undefs.setBits(BitOffset, BitOffset + CstSizeInBits); return true; } if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { - Mask = CInt->getValue().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask.insertBits(CInt->getValue(), BitOffset); return true; } if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { - Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); - Undefs = APInt::getNullValue(SizeInBits); + Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset); return true; } return false; }; + // Extract constant bits from build vector. + if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + const SDValue &Src = Op.getOperand(i); + unsigned BitOffset = i * SrcEltSizeInBits; + if (Src.isUndef()) { + UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); + continue; + } + auto *Cst = cast<ConstantSDNode>(Src); + APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); + MaskBits.insertBits(Bits, BitOffset); + } + return SplitBitData(); + } + // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); @@ -5268,117 +5467,59 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); - for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) { - APInt Bits, Undefs; - if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs)) + for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) + if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits, + i * CstEltSizeInBits)) return false; - MaskBits |= Bits.shl(i * CstEltSizeInBits); - UndefBits |= Undefs.shl(i * CstEltSizeInBits); - } return SplitBitData(); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && - EltSizeInBits <= Op.getScalarValueSizeInBits()) { + EltSizeInBits <= SrcEltSizeInBits) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { - APInt Bits, Undefs; - if (ExtractConstantBits(Broadcast, Bits, Undefs)) { - unsigned NumBroadcastBits = Op.getScalarValueSizeInBits(); - unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits; - for (unsigned i = 0; i != NumBroadcastElts; ++i) { - MaskBits |= Bits.shl(i * NumBroadcastBits); - UndefBits |= Undefs.shl(i * NumBroadcastBits); + APInt Bits(SizeInBits, 0); + APInt Undefs(SizeInBits, 0); + if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) { + for (unsigned i = 0; i != NumSrcElts; ++i) { + MaskBits |= Bits.shl(i * SrcEltSizeInBits); + UndefBits |= Undefs.shl(i * SrcEltSizeInBits); } return SplitBitData(); } } } + // Extract a rematerialized scalar constant insertion. 
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL && + Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { + auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); + MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); + MaskBits = MaskBits.zext(SizeInBits); + return SplitBitData(); + } + return false; } -// TODO: Merge more of this with getTargetConstantBitsFromNode. static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl<uint64_t> &RawMask) { - MaskNode = peekThroughBitcasts(MaskNode); - - MVT VT = MaskNode.getSimpleValueType(); - assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); - unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits; - - // Split an APInt element into MaskEltSizeInBits sized pieces and - // insert into the shuffle mask. - auto SplitElementToMask = [&](APInt Element) { - // Note that this is x86 and so always little endian: the low byte is - // the first byte of the mask. - int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits; - for (int i = 0; i < Split; ++i) { - APInt RawElt = Element.getLoBits(MaskEltSizeInBits); - Element = Element.lshr(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - }; - - if (MaskNode.getOpcode() == X86ISD::VBROADCAST) { - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0 - if (VT.getScalarSizeInBits() != MaskEltSizeInBits) - return false; - if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) { - const APInt &MaskElement = CN->getAPIntValue(); - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits); - RawMask.push_back(RawElt.getZExtValue()); - } - } + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + + // Extract the raw target constant bits. + // FIXME: We currently don't support UNDEF bits or mask entries. + if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, + EltBits, /* AllowWholeUndefs */ false, + /* AllowPartialUndefs */ false)) return false; - } - if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL && - MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) { - SDValue MaskOp = MaskNode.getOperand(0).getOperand(0); - if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) { - if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) { - RawMask.push_back(CN->getZExtValue()); - RawMask.append(NumMaskElts - 1, 0); - return true; - } - - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) { - unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits; - SplitElementToMask(CN->getAPIntValue()); - RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0); - return true; - } - } - return false; - } - - if (MaskNode.getOpcode() != ISD::BUILD_VECTOR) - return false; - - // We can always decode if the buildvector is all zero constants, - // but can't use isBuildVectorAllZeros as it might contain UNDEFs. 
- if (all_of(MaskNode->ops(), X86::isZeroNode)) { - RawMask.append(NumMaskElts, 0); - return true; - } - - // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0 - if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0) - return false; - - for (SDValue Op : MaskNode->ops()) { - if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode())) - SplitElementToMask(CN->getAPIntValue()); - else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode())) - SplitElementToMask(CFN->getValueAPF().bitcastToAPInt()); - else - return false; - } + // Insert the extracted elements into the mask. + for (APInt Elt : EltBits) + RawMask.push_back(Elt.getZExtValue()); return true; } @@ -5405,6 +5546,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); @@ -5473,8 +5615,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, IsUnary = true; break; case X86ISD::VBROADCAST: { - // We only decode broadcasts of same-sized vectors at the moment. - if (N->getOperand(0).getValueType() == VT) { + SDValue N0 = N->getOperand(0); + // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, + // add the pre-extracted value to the Ops vector. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getOperand(0).getValueType() == VT && + N0.getConstantOperandVal(1) == 0) + Ops.push_back(N0.getOperand(0)); + + // We only decode broadcasts of same-sized vectors, unless the broadcast + // came from an extract from the original width. If we found one, we + // pushed it the Ops vector above. + if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(VT, Mask); IsUnary = true; break; @@ -5669,6 +5821,19 @@ static bool setTargetShuffleZeroElements(SDValue N, V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); + assert((VT.getSizeInBits() % Mask.size()) == 0 && + "Illegal split of shuffle value type"); + unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + + // Extract known constant input data. + APInt UndefSrcElts[2]; + SmallVector<APInt, 32> SrcEltBits[2]; + bool IsSrcConstant[2] = { + getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], + SrcEltBits[0], true, false), + getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], + SrcEltBits[1], true, false)}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; @@ -5677,6 +5842,7 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; // Determine shuffle input and normalize the mask. + unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; @@ -5686,39 +5852,27 @@ static bool setTargetShuffleZeroElements(SDValue N, continue; } - // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. - if (V.getOpcode() != ISD::BUILD_VECTOR) - continue; - - // If the BUILD_VECTOR has fewer elements then the (larger) source - // element must be UNDEF/ZERO. - // TODO: Is it worth testing the individual bits of a constant? - if ((Size % V.getNumOperands()) == 0) { - int Scale = Size / V->getNumOperands(); - SDValue Op = V.getOperand(M / Scale); - if (Op.isUndef()) + // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. 
+ // TODO: We currently only set UNDEF for integer types - floats use the same + // registers as vectors and many of the scalar folded loads rely on the + // SCALAR_TO_VECTOR pattern. + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + (Size % V.getValueType().getVectorNumElements()) == 0) { + int Scale = Size / V.getValueType().getVectorNumElements(); + int Idx = M / Scale; + if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; - else if (X86::isZeroNode(Op)) + else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } - // If the BUILD_VECTOR has more elements then all the (smaller) source - // elements must be all UNDEF or all ZERO. - if ((V.getNumOperands() % Size) == 0) { - int Scale = V->getNumOperands() / Size; - bool AllUndef = true; - bool AllZero = true; - for (int j = 0; j < Scale; ++j) { - SDValue Op = V.getOperand((M * Scale) + j); - AllUndef &= Op.isUndef(); - AllZero &= X86::isZeroNode(Op); - } - if (AllUndef) + // Attempt to extract from the source's constant bits. + if (IsSrcConstant[SrcIdx]) { + if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; - else if (AllZero) + else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; - continue; } } @@ -5744,11 +5898,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, unsigned Opcode = N.getOpcode(); switch (Opcode) { - case ISD::AND: { + case ISD::AND: + case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. - SmallBitVector UndefElts; + APInt UndefElts; SmallVector<APInt, 32> EltBits; - if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits)) + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + bool IsAndN = (X86ISD::ANDNP == Opcode); + uint64_t ZeroMask = IsAndN ? 255 : 0; + if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { @@ -5758,9 +5917,55 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; - Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i); + Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } - Ops.push_back(N.getOperand(0)); + Ops.push_back(IsAndN ? N1 : N0); + return true; + } + case ISD::SCALAR_TO_VECTOR: { + // Match against a scalar_to_vector of an extract from a similar vector. + SDValue N0 = N.getOperand(0); + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + N0.getOperand(0).getValueType() != VT || + !isa<ConstantSDNode>(N0.getOperand(1)) || + NumElts <= N0.getConstantOperandVal(1) || + !N->isOnlyUserOf(N0.getNode())) + return false; + Ops.push_back(N0.getOperand(0)); + Mask.push_back(N0.getConstantOperandVal(1)); + Mask.append(NumElts - 1, SM_SentinelUndef); + return true; + } + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + SDValue InVec = N.getOperand(0); + SDValue InScl = N.getOperand(1); + uint64_t InIdx = N.getConstantOperandVal(2); + assert(InIdx < NumElts && "Illegal insertion index"); + + // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. + if (X86::isZeroNode(InScl)) { + Ops.push_back(InVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); + return true; + } + + // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern. + // TODO: Expand this to support INSERT_VECTOR_ELT/etc. + unsigned ExOp = + (X86ISD::PINSRB == Opcode ? 
X86ISD::PEXTRB : X86ISD::PEXTRW); + if (InScl.getOpcode() != ISD::AssertZext || + InScl.getOperand(0).getOpcode() != ExOp) + return false; + + SDValue ExVec = InScl.getOperand(0).getOperand(0); + uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1); + assert(ExIdx < NumElts && "Illegal extraction index"); + Ops.push_back(InVec); + Ops.push_back(ExVec); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::VSHLI: @@ -5795,6 +6000,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, } return true; } + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VZEXT: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); @@ -5810,36 +6016,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return false; } +/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. +static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask) { + int MaskWidth = Mask.size(); + SmallVector<SDValue, 16> UsedInputs; + for (int i = 0, e = Inputs.size(); i < e; ++i) { + int lo = UsedInputs.size() * MaskWidth; + int hi = lo + MaskWidth; + if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { + UsedInputs.push_back(Inputs[i]); + continue; + } + for (int &M : Mask) + if (lo <= M) + M -= MaskWidth; + } + Inputs = UsedInputs; +} + /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the -/// Op0/Op1 inputs accordingly. +/// inputs accordingly. /// Returns true if the target shuffle mask was decoded. -static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1, +static bool resolveTargetShuffleInputs(SDValue Op, + SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask) { - SmallVector<SDValue, 2> Ops; - if (!setTargetShuffleZeroElements(Op, Mask, Ops)) - if (!getFauxShuffleMask(Op, Mask, Ops)) + if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) + if (!getFauxShuffleMask(Op, Mask, Inputs)) return false; - int NumElts = Mask.size(); - bool Op0InUse = any_of(Mask, [NumElts](int Idx) { - return 0 <= Idx && Idx < NumElts; - }); - bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; }); - - Op0 = Op0InUse ? Ops[0] : SDValue(); - Op1 = Op1InUse ? Ops[1] : SDValue(); - - // We're only using Op1 - commute the mask and inputs. - if (!Op0InUse && Op1InUse) { - for (int &M : Mask) - if (NumElts <= M) - M -= NumElts; - Op0 = Op1; - Op1 = SDValue(); - } - + resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } @@ -5914,10 +6122,9 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, - unsigned NumNonZero, unsigned NumZero, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (NumNonZero > 8) return SDValue(); @@ -5928,18 +6135,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, // SSE4.1 - use PINSRB to insert each byte directly. 
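// resolveTargetShuffleInputsAndMask above drops shuffle inputs that the mask
// never references and renumbers the surviving indices. A standalone model of
// that pruning, with strings standing in for the SDValue inputs:
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

static void pruneInputs(std::vector<std::string> &Inputs,
                        std::vector<int> &Mask) {
  const int W = (int)Mask.size();  // each input contributes W mask slots
  std::vector<std::string> Used;
  for (const std::string &In : Inputs) {
    int Lo = (int)Used.size() * W, Hi = Lo + W;
    if (std::any_of(Mask.begin(), Mask.end(),
                    [Lo, Hi](int M) { return Lo <= M && M < Hi; })) {
      Used.push_back(In);
      continue;
    }
    // Input unused: shift every later index down by one input's worth of slots.
    for (int &M : Mask)
      if (M >= Lo)
        M -= W;
  }
  Inputs = Used;
}

int main() {
  std::vector<std::string> Inputs = {"V0", "V1", "V2"};
  std::vector<int> Mask = {8, 9, 10, 11};  // only V2's slots [8,12) are used
  pruneInputs(Inputs, Mask);
  for (const std::string &In : Inputs) std::printf("%s ", In.c_str());
  std::printf("| ");
  for (int M : Mask) std::printf("%d ", M);  // prints: V2 | 0 1 2 3
  std::printf("\n");
  return 0;
}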
if (Subtarget.hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. if (First) { - if (NumZero) - V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v16i8); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v16i8, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v16i8, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -5958,24 +6173,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } if ((i & 1) != 0) { + // FIXME: Investigate extending to i32 instead of just i16. + // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue ThisElt, LastElt; - bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; if (LastIsNonZero) { - LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, - MVT::i16, Op.getOperand(i-1)); + LastElt = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); - ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, - ThisElt, DAG.getConstant(8, dl, MVT::i8)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, + DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; - if (ThisElt.getNode()) - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, - DAG.getIntPtrConstant(i/2, dl)); + if (ThisElt) { + if (1 == i) { + V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) + : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + } else { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i / 2, dl)); + } + } } } @@ -5986,8 +6212,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + const X86Subtarget &Subtarget) { if (NumNonZero > 4) return SDValue(); @@ -5995,18 +6220,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { - bool isNonZero = (NonZeros & (1 << i)) != 0; - if (isNonZero) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. 
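// Without SSE4.1, the v16i8 build-vector path above packs each pair of bytes
// into one 16-bit lane (even-index byte in the low half, x86 being
// little-endian) before doing 16-bit insertions. The packing step by itself:
#include <cstdint>
#include <cstdio>

static uint16_t packBytePair(uint8_t Lo, uint8_t Hi) {
  // The odd-index byte is shifted into the high half and OR'd with the
  // even-index byte, mirroring the SHL/OR nodes built in the lowering above.
  return (uint16_t)((uint16_t)Hi << 8 | Lo);
}

int main() {
  uint8_t Bytes[4] = {0x11, 0x22, 0x33, 0x44};
  for (unsigned i = 0; i != 4; i += 2)
    std::printf("lane %u = 0x%04x\n", i / 2, packBytePair(Bytes[i], Bytes[i + 1]));
  // prints: lane 0 = 0x2211, lane 1 = 0x4433
  return 0;
}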
if (First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v8i16); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + continue; + } } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, - MVT::v8i16, V, Op.getOperand(i), - DAG.getIntPtrConstant(i, dl)); + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } @@ -6015,8 +6248,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { + const X86Subtarget &Subtarget) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { @@ -6212,7 +6444,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, /// /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, - SDLoc &DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { unsigned NumElems = Elts.size(); @@ -6376,14 +6608,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, return SDValue(); } -static Constant *getConstantVector(MVT VT, APInt SplatValue, +static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector<Constant *, 32> ConstantVec; for (unsigned i = 0; i < NumElm; i++) { - APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize); + APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { assert((ScalarSize == 32 || ScalarSize == 64) && @@ -6664,6 +6896,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); + // Quit if non-constant index. if (!isa<ConstantSDNode>(ExtIdx)) return SDValue(); @@ -6694,11 +6927,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); - for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { - unsigned Idx = InsertIndices[i]; + + for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); - } return NV; } @@ -7347,7 +7579,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); } return SDValue(); @@ -7418,7 +7650,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // a constant pool load than it is to do a movd + shuffle. 
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() && (!IsAllConstants || Idx == 0)) { - if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; @@ -7561,17 +7793,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, - DAG, Subtarget, *this)) + DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) - if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) + if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. @@ -7767,7 +7999,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); if (V1.isUndef()) - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); if (IsZeroV1) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); @@ -7956,7 +8188,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } -} + } return true; } @@ -7986,6 +8218,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, return true; } +// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle +// mask. +static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, + const APInt &Zeroable) { + int NumElts = Mask.size(); + assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + + SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); + TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + } + return TargetMask; +} + +// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd +// instructions. +static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) { + if (VT != MVT::v8i32 && VT != MVT::v8f32) + return false; + + SmallVector<int, 8> Unpcklwd; + createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, + /* Unary = */ false); + SmallVector<int, 8> Unpckhwd; + createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, + /* Unary = */ false); + bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || + isTargetShuffleEquivalent(Mask, Unpckhwd)); + return IsUnpackwdMask; +} + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. 
/// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -8009,7 +8276,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { return Imm; } -static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, +static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } @@ -8022,9 +8289,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); +static APInt computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); @@ -8039,7 +8306,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; + Zeroable.setBit(i); continue; } @@ -8057,17 +8324,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) - Zeroable[i] = true; + Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.lshr((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); - Zeroable[i] = (Val == 0); + if (Val == 0) + Zeroable.setBit(i); } continue; } @@ -8081,7 +8350,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } - Zeroable[i] = AllZeroable; + if (AllZeroable) + Zeroable.setBit(i); continue; } } @@ -8096,19 +8366,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. -static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, - ArrayRef<int> Mask,const EVT &VectorType, +static bool isNonZeroElementsInOrder(const APInt &Zeroable, + ArrayRef<int> Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. - for (int i = 0, e = Zeroable.size(); i < e; i++) { + for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. - if (Mask[i] == -1) + assert(Mask[i] >= -1 && "Out of bound mask element!"); + if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element - if (NextElement == -1) { + if (NextElement < 0) { NextElement = Mask[i] != 0 ? 
VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } @@ -8124,7 +8395,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable, static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8179,19 +8450,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); -// Function convertBitVectorToUnsigned - The function gets SmallBitVector -// as argument and convert him to unsigned. -// The output of the function is not(zeroable) -static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) { - unsigned convertBit = 0; - for (int i = 0, e = Zeroable.size(); i < e; i++) - convertBit |= !(Zeroable[i]) << i; - return convertBit; -} - // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8199,7 +8460,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); - unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable); + unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); @@ -8215,6 +8476,91 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, ZeroVector); } +static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, + unsigned &UnpackOpcode, bool IsUnary, + ArrayRef<int> TargetMask, SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + int NumElts = VT.getVectorNumElements(); + + bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; + for (int i = 0; i != NumElts; i += 2) { + int M1 = TargetMask[i + 0]; + int M2 = TargetMask[i + 1]; + Undef1 &= (SM_SentinelUndef == M1); + Undef2 &= (SM_SentinelUndef == M2); + Zero1 &= isUndefOrZero(M1); + Zero2 &= isUndefOrZero(M2); + } + assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && + "Zeroable shuffle detected"); + + // Attempt to match the target mask against the unpack lo/hi mask patterns. + SmallVector<int, 64> Unpckl, Unpckh; + createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); + V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); + return true; + } + + // If an unary shuffle, attempt to match as an unpack lo/hi with zero. + if (IsUnary && (Zero1 || Zero2)) { + // Don't bother if we can blend instead. 
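matchVectorShuffleWithUNPCK above compares the target mask against the interleave patterns produced by createUnpackShuffleMask. A minimal sketch of those patterns for one 128-bit lane of a two-input shuffle (the generator is illustrative; the in-tree helper also repeats the pattern per lane for wider types):

#include <cassert>
#include <vector>

// Shuffle mask performed by x86 UNPCKL/UNPCKH within a 128-bit lane:
// UNPCKL interleaves the low halves of both sources, UNPCKH the high halves.
static std::vector<int> unpackMask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  int Base = Lo ? 0 : NumElts / 2;
  for (int i = 0; i < NumElts / 2; ++i) {
    Mask.push_back(Base + i);            // element from the first source
    Mask.push_back(Base + i + NumElts);  // element from the second source
  }
  return Mask;
}

int main() {
  assert(unpackMask(4, /*Lo=*/true) == (std::vector<int>{0, 4, 1, 5}));
  assert(unpackMask(4, /*Lo=*/false) == (std::vector<int>{2, 6, 3, 7}));
  return 0;
}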
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && + isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) + return false; + + bool MatchLo = true, MatchHi = true; + for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { + int M = TargetMask[i]; + + // Ignore if the input is known to be zero or the index is undef. + if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || + (M == SM_SentinelUndef)) + continue; + + MatchLo &= (M == Unpckl[i]); + MatchHi &= (M == Unpckh[i]); + } + + if (MatchLo || MatchHi) { + UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; + V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; + return true; + } + } + + // If a binary shuffle, commute and try again. + if (!IsUnary) { + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { + UnpackOpcode = X86ISD::UNPCKL; + std::swap(V1, V2); + return true; + } + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { + UnpackOpcode = X86ISD::UNPCKH; + std::swap(V1, V2); + return true; + } + } + + return false; +} + // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, @@ -8248,13 +8594,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -8286,10 +8631,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); - int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - EltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector<SDValue, 16> MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) @@ -8307,51 +8650,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::OR, DL, VT, V1, V2); } -/// \brief Try to emit a blend instruction for a shuffle. -/// -/// This doesn't do any checks for the availability of instructions for blending -/// these values. It relies on the availability of the X86ISD::BLENDI pattern to -/// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is a blend, or convertible into a blend with zero. 
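lowerVectorShuffleAsBitBlend above implements the blend purely with AND/ANDNP/OR. The scalar identity it relies on, assuming the mask element is all-ones in lanes taken from the first input and all-zeros elsewhere (a minimal sketch):

#include <cassert>
#include <cstdint>

// Bitwise blend of two values: lanes where the mask is all-ones come from A,
// lanes where it is all-zeros come from B. Scalar form of AND(A, M) | ANDNP(M, B).
static uint32_t bitBlend(uint32_t A, uint32_t B, uint32_t M) {
  return (A & M) | (B & ~M);
}

int main() {
  // Take the low 16 bits from A and the high 16 bits from B.
  assert(bitBlend(0x11223344u, 0xAABBCCDDu, 0x0000FFFFu) == 0xAABB3344u);
  return 0;
}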
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Original, - const SmallBitVector &Zeroable, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - SmallVector<int, 8> Mask(Original.begin(), Original.end()); - bool ForceV1Zero = false, ForceV2Zero = false; +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget &Subtarget, + SelectionDAG &DAG); + +static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, + MutableArrayRef<int> TargetMask, + bool &ForceV1Zero, bool &ForceV2Zero, + uint64_t &BlendMask) { + bool V1IsZeroOrUndef = + V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZeroOrUndef = + V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); + + BlendMask = 0; + ForceV1Zero = false, ForceV2Zero = false; + assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. - unsigned BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - if (M < 0) + for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { + int M = TargetMask[i]; + if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { - BlendMask |= 1u << i; + BlendMask |= 1ull << i; continue; } - if (Zeroable[i]) { - if (V1IsZero) { + if (M == SM_SentinelZero) { + if (V1IsZeroOrUndef) { ForceV1Zero = true; - Mask[i] = i; + TargetMask[i] = i; continue; } - if (V2IsZero) { + if (V2IsZeroOrUndef) { ForceV2Zero = true; - BlendMask |= 1u << i; - Mask[i] = i + Size; + BlendMask |= 1ull << i; + TargetMask[i] = i + Size; continue; } } - return SDValue(); // Shuffled input! + return false; } + return true; +} + +uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { + uint64_t ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1ull << i)) + ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); + return ScaledMask; +} + +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is a blend, or convertible into a blend with zero. +static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Original, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); + + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + BlendMask)) + return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. 
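scaleVectorShuffleBlendMask above widens a per-element blend mask when the blend is re-emitted on narrower lanes, e.g. a 64-bit-element blend expressed as a 32-bit-element VPBLENDD. A standalone copy with a worked value:

#include <cassert>
#include <cstdint>

// Replicate each bit of a blend mask Scale times, so a mask over Size wide
// elements becomes a mask over Size * Scale narrower elements.
static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // 4-element mask 0b1010 scaled by 2 becomes the 8-element mask 0b11001100.
  assert(scaleBlendMask(0b1010, 4, 2) == 0b11001100);
  return 0;
}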
if (ForceV1Zero) @@ -8359,15 +8732,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); - auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { - unsigned ScaledMask = 0; - for (int i = 0; i != Size; ++i) - if (BlendMask & (1u << i)) - for (int j = 0; j != Scale; ++j) - ScaledMask |= 1u << (i * Scale + j); - return ScaledMask; - }; - switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -8387,7 +8751,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -8400,7 +8764,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -8417,7 +8781,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) - BlendMask |= 1u << i; + BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } @@ -8428,6 +8792,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + if (Subtarget.hasBWI() && Subtarget.hasVLX()) { + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); + return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); + } + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) @@ -8465,7 +8836,17 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } - + case MVT::v16f32: + case MVT::v8f64: + case MVT::v8i64: + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: { + MVT IntegerType = + MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); + SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); + return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); + } default: llvm_unreachable("Not a supported integer vector type!"); } @@ -8503,7 +8884,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } -/// \brief Generic routine to decompose a shuffle and blend into indepndent +/// \brief Generic routine to decompose a shuffle and blend into independent /// blends and permutes. 
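The new AVX-512 cases hand the blend to getVectorMaskingNode(V2, MaskNode, V1, ...), which, as used here, behaves like a per-element select on the k-mask: a set bit takes the second shuffle operand, a clear bit keeps the first. A scalar sketch under that assumption:

#include <cassert>
#include <cstdint>
#include <vector>

// Masked-move semantics assumed for the AVX-512 blend lowering: for each
// output element, a set bit in the k-mask selects Src, a clear bit keeps
// Passthru.
static std::vector<uint32_t> maskedSelect(uint64_t KMask,
                                          const std::vector<uint32_t> &Src,
                                          const std::vector<uint32_t> &Passthru) {
  std::vector<uint32_t> Result(Src.size());
  for (size_t i = 0; i < Src.size(); ++i)
    Result[i] = (KMask & (1ull << i)) ? Src[i] : Passthru[i];
  return Result;
}

int main() {
  assert(maskedSelect(0b0101, {10, 11, 12, 13}, {0, 1, 2, 3}) ==
         (std::vector<uint32_t>{10, 1, 12, 3}));
  return 0;
}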
/// /// This matches the extremely common pattern for handling combined @@ -8757,7 +9138,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef<int> Mask, int MaskOffset, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; @@ -8819,7 +9200,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); @@ -8855,12 +9236,12 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); - assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) @@ -8987,7 +9368,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and -/// begin and can start from an offseted element index in the input; to +/// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. @@ -9027,21 +9408,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { - // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - - // For 256-bit vectors, we only need the lower (128-bit) input half. - // For 512-bit vectors, we only need the lower input half or quarter. - if (VT.getSizeInBits() > 128) - InputV = extractSubVector(InputV, 0, DAG, DL, - std::max(128, (int)VT.getSizeInBits() / Scale)); - - InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); + InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -9158,7 +9532,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( /// are both incredibly common and often quite performance sensitive. 
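matchVectorShuffleAsShift recognizes masks where every kept element moves by a fixed number of positions and the vacated lanes become zero, so the whole shuffle can be emitted as a vector shift. A sketch of the left-shift case (illustrative helper; the sentinel values mirror the ones used in this file, -1 for undef and -2 for zero):

#include <cassert>
#include <vector>

// Returns true if Mask moves every kept element up by Shift positions and
// fills the vacated low lanes with zeros, i.e. the vector could be produced
// by a PSLLDQ/VSHLI-style shift.
static bool matchesLeftShift(const std::vector<int> &Mask, int Shift) {
  int Size = static_cast<int>(Mask.size());
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] == -1)
      continue;                      // undef matches anything
    if (i < Shift) {
      if (Mask[i] != -2)
        return false;                // shifted-in lanes must be zero
    } else if (Mask[i] != i - Shift) {
      return false;                  // kept lanes come from i - Shift
    }
  }
  return true;
}

int main() {
  // {zero, zero, 0, 1} on v4i32 is a shift left by two elements (PSLLDQ 8).
  assert(matchesLeftShift({-2, -2, 0, 1}, 2));
  assert(!matchesLeftShift({-2, 0, 1, 2}, 2));
  return 0;
}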
static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; @@ -9314,7 +9688,7 @@ static bool isShuffleFoldableLoad(SDValue V) { /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, const X86Subtarget &Subtarget, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); @@ -9612,7 +9986,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, if (((BroadcastIdx * EltSize) % 128) != 0) return SDValue(); - MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize); + // The shuffle input might have been a bitcast we looked through; look at + // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll + // later bitcast it to BroadcastVT. + MVT SrcVT = V.getSimpleValueType(); + assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + "Unexpected vector element size"); + assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) && + "Unexpected vector size"); + + MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize); V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, DAG.getIntPtrConstant(BroadcastIdx, DL)); } @@ -9642,6 +10025,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } + // We only support broadcasting from 128-bit vectors to minimize the + // number of patterns we need to deal with in isel. So extract down to + // 128-bits. + if (SrcVT.getSizeInBits() > 128) + V = extract128BitVector(V, 0, DAG, DL); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -9653,7 +10042,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); @@ -9742,7 +10131,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); @@ -9877,7 +10266,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -9959,7 +10348,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// it falls back to the floating point shuffle operation with appropriate bit /// casting. 
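matchVectorShuffleAsInsertPS builds the INSERTPS control byte directly. For reference, the SSE4.1 immediate layout it targets: bits [7:6] select the source element, bits [5:4] the destination slot, bits [3:0] are a zero mask (the helper below is illustrative):

#include <cassert>
#include <cstdint>

// Assemble an INSERTPS control byte: take SrcElt from the second operand,
// place it into lane DstElt of the first operand, and zero every destination
// lane whose bit is set in ZMask.
static uint8_t insertPSImm(unsigned SrcElt, unsigned DstElt, unsigned ZMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZMask < 16);
  return static_cast<uint8_t>((SrcElt << 6) | (DstElt << 4) | ZMask);
}

int main() {
  // Insert element 2 of the second operand into lane 0 and zero lane 3.
  assert(insertPSImm(2, 0, 0b1000) == 0x88);
  return 0;
}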
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10178,7 +10567,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10261,7 +10650,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -10353,7 +10742,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build - // up the inputs, bypassing domain shift penalties that we would encur if we + // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); @@ -10384,18 +10773,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); - assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); + assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef<int> LoMask = Mask.slice(0, 4); MutableArrayRef<int> HiMask = Mask.slice(4, 4); SmallVector<int, 4> LoInputs; - std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0; }); + copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 0; }); + copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = @@ -10574,7 +10961,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); - else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from @@ -10830,7 +11217,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( /// blend if only one input is used. 
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse, + const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; @@ -10891,7 +11278,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11075,7 +11462,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11132,14 +11519,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector<int, 4> LoInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), - [](int M) { return M >= 0 && M < 8; }); + copy_if(Mask, std::back_inserter(LoInputs), + [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector<int, 4> HiInputs; - std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), - [](int M) { return M >= 8; }); + copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); @@ -11193,7 +11579,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && - "Conflicting entrties in the original shuffle!"); + "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, @@ -11365,7 +11751,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { @@ -11621,7 +12007,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, /// \brief Handle lowering 2-lane 128-bit shuffles. 
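lowerVectorShuffleAsBlendOfPSHUFBs builds one PSHUFB control vector per input and ORs the two results. A scalar sketch of the per-byte PSHUFB rule it relies on (128-bit form; for 256-bit vectors the selection stays within each 128-bit lane):

#include <array>
#include <cassert>
#include <cstdint>

// PSHUFB, 128-bit form: for each output byte, a set bit 7 in the control byte
// forces zero, otherwise the low four bits index into the source register.
static std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Ctl) {
  std::array<uint8_t, 16> Dst{};
  for (int i = 0; i < 16; ++i)
    Dst[i] = (Ctl[i] & 0x80) ? 0 : Src[Ctl[i] & 0x0F];
  return Dst;
}

int main() {
  std::array<uint8_t, 16> Src{};
  for (int i = 0; i < 16; ++i)
    Src[i] = static_cast<uint8_t>(i + 1);
  std::array<uint8_t, 16> Ctl{};    // all zeros: every byte selects Src[0]
  Ctl[0] = 3;                       // byte 0 takes Src[3]
  Ctl[1] = 0x80;                    // byte 1 is forced to zero
  std::array<uint8_t, 16> Dst = pshufb(Src, Ctl);
  assert(Dst[0] == 4 && Dst[1] == 0 && Dst[2] == 1);
  return 0;
}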
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector<int, 4> WidenedMask; @@ -12091,7 +12477,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); - assert(VT.getScalarType() == MVT::f64 && + assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); @@ -12127,6 +12513,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& + "Unexpected data type for VSHUFPD"); + unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); @@ -12153,7 +12542,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12250,7 +12639,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12338,7 +12727,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12414,6 +12803,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code using vpunpcklwd and + // vpunpckhwd instrs than vblend. + if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) + if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG)) + return V; + // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) @@ -12429,7 +12826,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. 
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12445,6 +12842,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; + // For non-AVX512 if the Mask is of 16bit elements in lane then try to split + // since after split we get a more efficient code than vblend by using + // vpunpcklwd and vpunpckhwd instrs. + if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && + !Subtarget.hasAVX512()) + if (SDValue V = + lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) + return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; @@ -12533,7 +12939,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12619,7 +13025,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12692,7 +13098,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we @@ -12844,7 +13250,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12891,12 +13297,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. 
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, +static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12925,6 +13335,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } @@ -12938,7 +13352,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask, /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -12994,12 +13408,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13062,12 +13480,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, V1, V2, DAG, Subtarget)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13109,12 +13530,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, - const SmallBitVector &Zeroable, + const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -13159,6 +13584,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Blend; + // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } @@ -13170,7 +13599,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, /// together based on the available instructions. 
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, - const SmallBitVector &Zeroable, + const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && @@ -13251,7 +13680,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) - V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V1 = getOnesVector(ExtVT, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); @@ -13260,7 +13689,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) - V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + V2 = getOnesVector(ExtVT, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); @@ -13392,8 +13821,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - if (Zeroable.all()) + APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but @@ -13569,10 +13998,14 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, - // extend vector to VR512 + // extend vector to VR512/128 if (!isa<ConstantSDNode>(Idx)) { - MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); - SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec); + unsigned NumElts = VecVT.getVectorNumElements(); + // Extending v8i1/v16i1 to 512-bit get better performance on KNL + // than extending to 128/256bit. + unsigned VecSize = (NumElts <= 4 ? 
128 : 512); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); @@ -13590,9 +14023,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned MaxSift = VecVT.getVectorNumElements() - 1; if (MaxSift - IdxVal) - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, DAG.getIntPtrConstant(0, dl)); @@ -13610,24 +14043,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return ExtractBitFromMaskVector(Op, DAG); if (!isa<ConstantSDNode>(Idx)) { - if (VecVT.is512BitVector() || - (VecVT.is256BitVector() && Subtarget.hasInt256() && - VecVT.getScalarSizeInBits() == 32)) { - - MVT MaskEltVT = - MVT::getIntegerVT(VecVT.getScalarSizeInBits()); - MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() / - MaskEltVT.getSizeInBits()); + // Its more profitable to go through memory (1 cycles throughput) + // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) + // IACA tool was used to get performance estimation + // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) + // + // example : extractelement <16 x i8> %a, i32 %i + // + // Block Throughput: 3.00 Cycles + // Throughput Bottleneck: Port5 + // + // | Num Of | Ports pressure in cycles | | + // | Uops | 0 - DV | 5 | 6 | 7 | | + // --------------------------------------------- + // | 1 | | 1.0 | | | CP | vmovd xmm1, edi + // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 + // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 + // Total Num Of Uops: 4 + // + // + // Block Throughput: 1.00 Cycles + // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 + // + // | | Ports pressure in cycles | | + // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | + // --------------------------------------------------------- + // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 + // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] + // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] + // Total Num Of Uops: 4 - Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT, - getZeroVector(MaskVT, Subtarget, DAG, dl), Idx, - DAG.getConstant(0, dl, PtrVT)); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm, - DAG.getConstant(0, dl, PtrVT)); - } return SDValue(); } @@ -13675,7 +14120,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - // TODO: handle v16i8. + // TODO: We only extract a single element from v16i8, we can probably afford + // to be more aggressive here before using the default approach of spilling to + // stack. + if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { + // Extract either the lowest i32 or any i16, and extract the sub-byte. 
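The byte-extract path added below avoids a dedicated v16i8 extract by pulling out the containing 32-bit (or 16-bit) element and shifting the byte into place. In scalar terms, assuming little-endian lane order (illustrative helper):

#include <cassert>
#include <cstdint>

// Extract byte IdxVal from a 16-byte vector by first extracting the 32-bit
// dword that contains it, then shifting and truncating (the in-tree code uses
// the dword path for index 0..3 and an equivalent word path otherwise).
static uint8_t extractByteViaDword(const uint32_t Dwords[4], unsigned IdxVal) {
  assert(IdxVal < 16);
  uint32_t Dword = Dwords[IdxVal / 4];              // EXTRACT_VECTOR_ELT (v4i32)
  unsigned ShiftVal = (IdxVal % 4) * 8;             // byte offset in the dword
  return static_cast<uint8_t>(Dword >> ShiftVal);   // SRL + TRUNCATE
}

int main() {
  const uint32_t Dwords[4] = {0x03020100u, 0x07060504u,
                              0x0B0A0908u, 0x0F0E0D0Cu};
  assert(extractByteViaDword(Dwords, 0) == 0x00);
  assert(extractByteViaDword(Dwords, 6) == 0x06);
  return 0;
}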
+ int DWordIdx = IdxVal / 4; + if (DWordIdx == 0) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), + DAG.getIntPtrConstant(DWordIdx, dl)); + int ShiftVal = (IdxVal % 4) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, + DAG.getConstant(ShiftVal, dl, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + int WordIdx = IdxVal / 2; + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + DAG.getBitcast(MVT::v8i16, Vec), + DAG.getIntPtrConstant(WordIdx, dl)); + int ShiftVal = (IdxVal % 2) * 8; + if (ShiftVal != 0) + Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, + DAG.getConstant(ShiftVal, dl, MVT::i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) @@ -13734,7 +14205,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if(Vec.isUndef()) { if (IdxVal) - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } @@ -13744,21 +14215,21 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if (IdxVal == 0 ) { // EltInVec already at correct index and other bits are 0. // Clean the first bit in source vector. - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); @@ -13790,17 +14261,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, auto *N2C = cast<ConstantSDNode>(N2); unsigned IdxVal = N2C->getZExtValue(); - // If we are clearing out a element, we do this more efficiently with a - // blend shuffle than a costly integer insertion. - // TODO: would other rematerializable values (e.g. allbits) benefit as well? + bool IsZeroElt = X86::isZeroNode(N1); + bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); + + // If we are inserting a element, see if we can do this more efficiently with + // a blend shuffle with a rematerializable vector than a costly integer + // insertion. // TODO: pre-SSE41 targets will tend to use bit masking - this could still // be beneficial if we are inserting several zeros and can combine the masks. - if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) { - SmallVector<int, 8> ClearMask; + if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) { + SmallVector<int, 8> BlendMask; for (unsigned i = 0; i != NumElts; ++i) - ClearMask.push_back(i == IdxVal ? 
i + NumElts : i); - SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask); + BlendMask.push_back(i == IdxVal ? i + NumElts : i); + SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) + : DAG.getConstant(-1, dl, VT); + return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert @@ -13837,25 +14312,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget.hasSSE41()) { - if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { - unsigned Opc; - if (VT == MVT::v8i16) { - Opc = X86ISD::PINSRW; - } else { - assert(VT == MVT::v16i8); - Opc = X86ISD::PINSRB; - } - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. SSE41 required for pinsrb. + if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { + unsigned Opc; + if (VT == MVT::v8i16) { + assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); + assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); + Opc = X86ISD::PINSRB; } + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal, dl); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into @@ -13885,36 +14362,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } - if (EltVT == MVT::i32 || EltVT == MVT::i64) { - // PINSR* works with constant index. + // PINSR* works with constant index. + if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; - } } - if (EltVT == MVT::i8) - return SDValue(); - - if (EltVT.getSizeInBits() == 16) { - // Transform it so it match pinsrw which expects a 16-bit value in a GR32 - // as its second argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(IdxVal, dl); - return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); - } return SDValue(); } -static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); + // It's always cheaper to replace a xor+movd with xorps and simplifies further + // combines. + if (X86::isZeroNode(Op.getOperand(0))) + return getZeroVector(OpVT, Subtarget, DAG, dl); + // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. 
- unsigned SizeFactor = OpVT.getSizeInBits()/128; + unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); @@ -13923,9 +14393,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } + assert(OpVT.is128BitVector() && "Expected an SSE type!"); + + // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. + if (OpVT == MVT::v4i32) + return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); - assert(OpVT.is128BitVector() && "Expected an SSE type!"); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } @@ -13947,20 +14421,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, In.getSimpleValueType().is512BitVector()) && "Can only extract from 256-bit or 512-bit vectors"); - if (ResVT.is128BitVector()) - return extract128BitVector(In, IdxVal, DAG, dl); - if (ResVT.is256BitVector()) - return extract256BitVector(In, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); -} + // If the input is a buildvector just emit a smaller one. + unsigned ElemsPerChunk = ResVT.getVectorNumElements(); + if (In.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT, + makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); -static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) - if (llvm::all_of(ValidUsers, - [&I](SDValue V) { return V.getNode() != *I; })) - return false; - return true; + // Everything else is legal. + return Op; } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a @@ -13968,83 +14436,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) { // the upper bits of a vector. 
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX"); - - SDLoc dl(Op); - SDValue Vec = Op.getOperand(0); - SDValue SubVec = Op.getOperand(1); - SDValue Idx = Op.getOperand(2); - - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT OpVT = Op.getSimpleValueType(); - MVT SubVecVT = SubVec.getSimpleValueType(); - - if (OpVT.getVectorElementType() == MVT::i1) - return insert1BitVector(Op, DAG, Subtarget); - - assert((OpVT.is256BitVector() || OpVT.is512BitVector()) && - "Can only insert into 256-bit or 512-bit vectors"); + assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); - // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte - // load: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr + 16), Elts/2) - // --> load32 addr - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr + 32), Elts/2) - // --> load64 addr - // or a 16-byte or 32-byte broadcast: - // (insert_subvector (insert_subvector undef, (load16 addr), 0), - // (load16 addr), Elts/2) - // --> X86SubVBroadcast(load16 addr) - // or: - // (insert_subvector (insert_subvector undef, (load32 addr), 0), - // (load32 addr), Elts/2) - // --> X86SubVBroadcast(load32 addr) - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { - auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); - if (Idx2 && Idx2->getZExtValue() == 0) { - SDValue SubVec2 = Vec.getOperand(1); - // If needed, look through bitcasts to get to the load. - if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { - bool Fast; - unsigned Alignment = FirstLd->getAlignment(); - unsigned AS = FirstLd->getAddressSpace(); - const X86TargetLowering *TLI = Subtarget.getTargetLowering(); - if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), - OpVT, AS, Alignment, &Fast) && Fast) { - SDValue Ops[] = {SubVec2, SubVec}; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; - } - } - // If lower/upper loads are the same and the only users of the load, then - // lower to a VBROADCASTF128/VBROADCASTI128/etc. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { - if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && - areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - } - } - // If this is subv_broadcast insert into both halves, use a larger - // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { - return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, - SubVec.getOperand(0)); - } - } - } - - if (SubVecVT.is128BitVector()) - return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl); - - if (SubVecVT.is256BitVector()) - return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - - llvm_unreachable("Unimplemented!"); + return insert1BitVector(Op, DAG, Subtarget); } // Returns the appropriate wrapper opcode for a global reference. @@ -14062,7 +14456,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as -// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// their target counterpart wrapped in the X86ISD::Wrapper node. 
Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected @@ -14438,7 +14832,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture - // Need to generate someting similar to: + // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) @@ -15489,32 +15883,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, - DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } - // Truncate with PACKSS if we are truncating a vector comparison result. - // TODO: We should be able to support other operations as long as we - // we are saturating+packing zero/all bits only. - auto IsPackableComparison = [](SDValue V) { - unsigned Opcode = V.getOpcode(); - return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ || - Opcode == X86ISD::CMPP); - }; - - if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS && - all_of(In->ops(), IsPackableComparison))) { + // Truncate with PACKSS if we are truncating a vector zero/all-bits result. + if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) return V; - } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); - In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32), - ShufMask); + In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } @@ -15530,30 +15913,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { - // On AVX2, v8i32 -> v8i16 becomed PSHUFB. + // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
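For reference, a minimal standalone C++ emulation of the byte-shuffle truncation used in the hunk just below. It assumes a little-endian host and ignores PSHUFB's 128-bit-lane restriction (which the real sequence handles with the following cross-lane v4i64 shuffle); the lane values are illustrative only.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Eight 32-bit lanes; the truncation keeps the low 16 bits of each.
  uint32_t Src[8] = {0x11112222u, 0x33334444u, 0x55556666u, 0x77778888u,
                     0x9999AAAAu, 0xBBBBCCCCu, 0xDDDDEEEEu, 0xFFFF0101u};
  uint8_t Bytes[32];
  std::memcpy(Bytes, Src, sizeof(Bytes));

  // Byte indices in the spirit of the mask below: the low two bytes of each
  // dword (little endian). The -1 entries in the real mask mean "zero".
  const int Mask[16] = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};

  uint8_t Out[16];
  for (int i = 0; i != 16; ++i)
    Out[i] = Bytes[Mask[i]];

  uint16_t Dst[8];
  std::memcpy(Dst, Out, sizeof(Dst));
  for (int i = 0; i != 8; ++i)
    assert(Dst[i] == static_cast<uint16_t>(Src[i]));   // low halves survive
  return 0;
}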
if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); - SmallVector<SDValue,32> pshufbMask; - for (unsigned i = 0; i < 2; ++i) { - pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8)); - pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8)); - for (unsigned j = 0; j < 8; ++j) - pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); - } - SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask); - In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV); + // The PSHUFB mask: + static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, + -1, -1, -1, -1, -1, -1, -1, -1, + 16, 17, 20, 21, 24, 25, 28, 29, + -1, -1, -1, -1, -1, -1, -1, -1 }; + In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); - static const int ShufMask[] = {0, 2, -1, -1}; - In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64), - ShufMask); + static const int ShufMask2[] = {0, 2, -1, -1}; + In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); @@ -15572,9 +15945,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; - SDValue Undef = DAG.getUNDEF(MVT::v16i8); - OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1); + OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); + OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); @@ -15598,17 +15970,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; - SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In), - DAG.getUNDEF(NVT), MaskVec); + In = DAG.getBitcast(NVT, In); + SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } -SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; - MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { @@ -15616,8 +15985,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, - dl, VT, + return DAG.getNode(IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } @@ -15891,7 +16259,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); - // If more than one full vectors are evaluated, OR them first before PTEST. + // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. @@ -15900,8 +16268,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, - VecIns.back(), VecIns.back()); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// \brief return true if \c Op has a use that doesn't just read flags. @@ -16366,7 +16733,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, } /// If we have at least two divisions that use the same divisor, convert to -/// multplication by a reciprocal. This may need to be adjusted for a given +/// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the @@ -17241,12 +17608,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y + // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - - unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = + cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { @@ -17283,6 +17652,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; + } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && + Cmp.getOperand(0).getOpcode() == ISD::AND && + isOneConstant(Cmp.getOperand(0).getOperand(1))) { + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue Src1, Src2; + // true if Op2 is XOR or OR operator and one of its operands + // is equal to Op1 + // ( a , a op b) || ( b , a op b) + auto isOrXorPattern = [&]() { + if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && + (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { + Src1 = + Op2.getOperand(0) == Op1 ? 
Op2.getOperand(1) : Op2.getOperand(0); + Src2 = Op1; + return true; + } + return false; + }; + + if (isOrXorPattern()) { + SDValue Neg; + unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); + // we need mask of all zeros or ones with same size of the other + // operands. + if (CmpSz > VT.getSizeInBits()) + Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); + else if (CmpSz < VT.getSizeInBits()) + Neg = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), + DAG.getConstant(1, DL, VT)); + else + Neg = CmpOp0; + SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Neg); // -(and (x, 0x1)) + SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z + return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y + } } } @@ -17423,17 +17829,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, // SKX processor if ((InVTElt == MVT::i1) && - (((Subtarget.hasBWI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || - - ((Subtarget.hasBWI() && VT.is512BitVector() && - VTElt.getSizeInBits() <= 16)) || + (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) || - ((Subtarget.hasDQI() && Subtarget.hasVLX() && - VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))) - ((Subtarget.hasDQI() && VT.is512BitVector() && - VTElt.getSizeInBits() >= 32)))) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); unsigned NumElts = VT.getVectorNumElements(); @@ -17441,8 +17840,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, if (VT.is512BitVector() && InVTElt != MVT::i1 && (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) - return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); + return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); } if (InVTElt != MVT::i1) @@ -17454,10 +17853,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SDValue V; if (Subtarget.hasDQI()) { - V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); assert(!VT.is512BitVector() && "Unexpected vector type"); } else { - SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); + SDValue NegOne = getOnesVector(ExtVT, DAG, dl); SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero); if (ExtVT == VT) @@ -17506,11 +17905,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); - // SSE41 targets can use the pmovsx* instructions directly. - unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? - X86ISD::VSEXT : X86ISD::VZEXT; - if (Subtarget.hasSSE41()) + // SSE41 targets can use the pmovsx* instructions directly for 128-bit results, + // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still + // need to be handled here for 256/512-bit results. + if (Subtarget.hasInt256()) { + assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); + unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? + X86ISD::VSEXT : X86ISD::VZEXT; return DAG.getNode(ExtOpc, dl, VT, In); + } // We should only get here for sign extend. 
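The CMOV-free select lowering added above relies on -(x & 1) being either 0 or all-ones, so ((-(x & 1)) & z) op y reproduces select((x & 1) == 0, y, z op y) for op in {xor, or}. A minimal standalone check of that identity, exhaustive over small operand ranges:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x != 4; ++x)
    for (uint32_t y = 0; y != 16; ++y)
      for (uint32_t z = 0; z != 16; ++z) {
        uint32_t c = x & 1u;
        uint32_t Mask = 0u - c;                 // 0 or all-ones, branch-free
        uint32_t XorRef = (c == 0) ? y : (z ^ y);
        uint32_t OrRef  = (c == 0) ? y : (z | y);
        assert(((Mask & z) ^ y) == XorRef);
        assert(((Mask & z) | y) == OrRef);
      }
  return 0;
}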
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && @@ -17595,8 +17998,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); + OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } @@ -17674,7 +18077,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, MVT VT = Op.getValueType().getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); - if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || + if ((Subtarget.hasBWI() && NumElts >= 32) || + (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { @@ -17703,7 +18107,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, if (NumElts <= 8) { // A subset, assume that we have only AVX-512F - unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts; + unsigned NumBitsToLoad = 8; MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad); SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(), Ld->getBasePtr(), @@ -17911,7 +18315,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget.hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } @@ -18469,6 +18873,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); + // Bitcast the source vector to the output type, this is mainly necessary for + // vXi8/vXi64 shifts. + if (VT != SrcOp.getSimpleValueType()) + SrcOp = DAG.getBitcast(VT, SrcOp); + // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; @@ -18485,9 +18894,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a - // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT. - if (VT == SrcOp.getSimpleValueType() && - ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { + // vector of Constants or UNDEFs. 
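As a side note, the DQI-less path in LowerSIGN_EXTEND_AVX512 above materializes sext of an i1 mask as VSELECT(In, NegOne, Zero). A minimal standalone check that select(b, -1, 0) matches sign extension of a one-bit value:

#include <cassert>
#include <cstdint>

int main() {
  for (int b = 0; b <= 1; ++b) {
    int32_t ViaSext = -b;                             // sign-extending an i1: 0 or -1
    int32_t ViaSelect = b ? int32_t(-1) : int32_t(0); // the VSELECT(NegOne, Zero) form
    assert(ViaSext == ViaSelect);
  }
  return 0;
}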
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector<SDValue, 8> Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; @@ -18578,11 +18986,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -18853,6 +19261,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, VT, Src1, Src2, Rnd), + Mask, passThru, Subtarget, DAG); + } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } @@ -19306,6 +19722,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Src2, Src1); return DAG.getBitcast(VT, Res); } + case MASK_BINOP: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); + return DAG.getBitcast(VT, Res); + } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: @@ -19478,6 +19903,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case Intrinsic::x86_avx512_knot_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kandn_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + // Invert LHS for the not. + LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, + DAG.getConstant(1, dl, MVT::v16i1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); + return DAG.getBitcast(MVT::i16, Res); + } + + case Intrinsic::x86_avx512_kxnor_w: { + SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); + SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); + SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); + // Invert result for the not. 
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, + DAG.getConstant(1, dl, MVT::v16i1)); + return DAG.getBitcast(MVT::i16, Res); + } + case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -19603,6 +20055,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget } } +static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, + SDValue Src, SDValue Mask, SDValue Base, + SDValue Index, SDValue ScaleOp, SDValue Chain, + const X86Subtarget &Subtarget) { + SDLoc dl(Op); + auto *C = cast<ConstantSDNode>(ScaleOp); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + EVT MaskVT = Mask.getValueType(); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? + if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; + SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); + SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; + return DAG.getMergeValues(RetOps, dl); +} + static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, @@ -19617,7 +20091,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - if (Src.isUndef()) + // If source is undef or we know it won't be used, use a zero vector + // to break register dependency. + // TODO: use undef instead and let ExecutionDepsFix deal with it? 
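The KNOT/KANDN/KXNOR lowerings added above express NOT as XOR with an all-ones v16i1 constant. A minimal standalone check of the equivalent 16-bit mask identities, with arbitrary sample masks (illustrative only, not LLVM API):

#include <cassert>
#include <cstdint>

static uint16_t Knot(uint16_t A)              { return A ^ 0xFFFFu; }
static uint16_t Kandn(uint16_t A, uint16_t B) { return (A ^ 0xFFFFu) & B; }
static uint16_t Kxnor(uint16_t A, uint16_t B) { return (A ^ B) ^ 0xFFFFu; }

int main() {
  const uint16_t A = 0xA5C3u, B = 0x0FF0u;
  assert(Knot(A)     == static_cast<uint16_t>(~A));        // knot:  ~a
  assert(Kandn(A, B) == static_cast<uint16_t>(~A & B));    // kandn: ~a & b
  assert(Kxnor(A, B) == static_cast<uint16_t>(~(A ^ B)));  // kxnor: ~(a ^ b)
  return 0;
}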
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -19656,7 +20133,6 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - //SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); @@ -19928,6 +20404,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } + case GATHER_AVX2: { + SDValue Chain = Op.getOperand(0); + SDValue Src = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + SDValue Scale = Op.getOperand(6); + return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, + Scale, Chain, Subtarget); + } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); @@ -19953,8 +20439,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); - assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); + assert((HintVal == 2 || HintVal == 3) && + "Wrong prefetch hint in intrinsic: should be 2 or 3"); + unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -20368,7 +20855,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // Check that ECX wasn't needed by an 'inreg' parameter. 
FunctionType *FTy = Func->getFunctionType(); - const AttributeSet &Attrs = Func->getAttributes(); + const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; @@ -20802,9 +21289,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), +static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT.getScalarType() == MVT::i1) + return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && @@ -20812,14 +21300,23 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } -static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { - if (Op.getValueType() == MVT::i1) - return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Op.getOperand(1)); +static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return Lower256IntArith(Op, DAG); + MVT VT = Op.getSimpleValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + SDValue Lo = extract128BitVector(Src, 0, DAG, dl); + SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl); + + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(ISD::ABS, dl, NewVT, Lo), + DAG.getNode(ISD::ABS, dl, NewVT, Hi)); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { @@ -20834,7 +21331,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - if (VT == MVT::i1) + if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into smaller 128-bit ops. 
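The i1 special cases above (ADD/SUB lowered to XOR, MUL lowered to AND) are just arithmetic modulo 2. A minimal standalone check over the full i1 truth table:

#include <cassert>

int main() {
  for (int a = 0; a <= 1; ++a)
    for (int b = 0; b <= 1; ++b) {
      assert(((a + b) & 1) == (a ^ b));   // i1 add == xor
      assert(((a - b) & 1) == (a ^ b));   // i1 sub == xor
      assert(((a * b) & 1) == (a & b));   // i1 mul == and
    }
  return 0;
}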
@@ -20874,8 +21371,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); - BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); + BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -20894,8 +21391,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); - BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); + BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21056,8 +21553,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } - SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); - SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); + SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); + SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, DAG.getConstant(8, dl, MVT::v16i16)); @@ -21073,8 +21570,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Extract the lo parts and zero/sign extend to i16. SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(ExSSE41, dl, ExVT, A); - BLo = DAG.getNode(ExSSE41, dl, ExVT, B); + ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); + BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -21093,8 +21590,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); - BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); + BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21148,8 +21645,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); } @@ -21157,11 +21654,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain) - .setCallee(getLibcallCallingConv(LC), - static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), - Callee, std::move(Args)) - .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); + CLI.setDebugLoc(dl) + .setChain(InChain) + .setLibCallee( + getLibcallCallingConv(LC), + static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, + std::move(Args)) + .setInRegister() + 
.setSExtResult(isSigned) + .setZExtResult(!isSigned); std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); @@ -21269,15 +21770,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() < 16) return false; - if (VT.is512BitVector() && + if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; - bool LShift = VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasInt256()); + bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || + (VT.is256BitVector() && Subtarget.hasInt256()); - bool AShift = LShift && (Subtarget.hasVLX() || - (VT != MVT::v2i64 && VT != MVT::v4i64)); + bool AShift = LShift && (Subtarget.hasAVX512() || + (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } @@ -21301,7 +21802,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (VT.is512BitVector() || Subtarget.hasVLX()) + if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -22062,10 +22563,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::INC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; @@ -22077,10 +22578,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. if (isOneConstant(RHS)) { - BaseOp = X86ISD::DEC; - Cond = X86::COND_O; - break; - } + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; @@ -22470,7 +22971,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and - // masked out higher ones) for each byte. PSHUB is used separately with both + // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. // @@ -22867,8 +23368,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, Entry.Node = Arg; Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; + Entry.IsSExt = false; + Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; @@ -22885,8 +23386,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, RetTy, Callee, std::move(Args)); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); @@ -23086,7 +23588,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. 
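A minimal standalone check of the nibble-LUT population count scheme described in the LowerVectorCTPOPInRegLUT comment above: each byte's low and high nibble index a 16-entry table (the PSHUFB step), and the two lookups are added to give the per-byte count.

#include <cassert>
#include <cstdint>

int main() {
  // 16-entry popcount table indexed by a nibble, as held in-register for PSHUFB.
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  for (int v = 0; v != 256; ++v) {
    uint8_t Lo = v & 0x0F;
    uint8_t Hi = (v >> 4) & 0x0F;
    int Expected = 0;
    for (int t = v; t; t >>= 1)
      Expected += t & 1;
    assert(LUT[Lo] + LUT[Hi] == Expected);
  }
  return 0;
}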
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23142,7 +23644,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, // Mask element has to be i1. MVT MaskEltTy = Mask.getSimpleValueType().getScalarType(); assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) && - "We handle 4x32, 4x64 and 2x64 vectors only in this casse"); + "We handle 4x32, 4x64 and 2x64 vectors only in this case"); MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec); @@ -23202,7 +23704,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); - // The pass-thru value + // The pass-through value MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); Src0 = ExtendToType(Src0, NewVT, DAG); @@ -23284,7 +23786,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); - case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); @@ -23303,7 +23805,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: @@ -23360,12 +23862,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); - case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::ADD: + case ISD::SUB: return LowerADD_SUB(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); + case ISD::ABS: return LowerABS(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); @@ -23768,7 +24271,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; @@ -23779,16 +24281,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return 
"X86ISD::FHSUB"; - case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; + case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; + case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; + case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::FRCPS: return "X86ISD::FRCPS"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; @@ -23827,7 +24332,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; @@ -23876,6 +24380,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; + case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -23976,9 +24482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; + case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; + case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; + case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; + case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; @@ -24302,7 +24812,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24338,7 +24848,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) - MIB.addOperand(Op); + MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); @@ -24398,7 +24908,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, unsigned MemReg = Subtarget.is64Bit() ? 
X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) @@ -24413,6 +24923,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, + const X86Subtarget &Subtarget) { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + // Address into RAX/EAX + unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.add(MI->getOperand(i)); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + + + MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { @@ -24536,12 +25066,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) @@ -24561,12 +25091,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 16) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 16) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); @@ -24588,13 +25118,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, UseFPOffset ? 4 : 0) - .addOperand(Segment) - .addReg(NextOffsetReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .add(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) @@ -24608,12 +25138,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Load the overflow_area address into a register. 
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. @@ -24644,13 +25174,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) - .addOperand(Base) - .addOperand(Scale) - .addOperand(Index) - .addDisp(Disp, 8) - .addOperand(Segment) - .addReg(NextAddrReg) - .setMemRefs(MMOBegin, MMOEnd); + .add(Base) + .add(Scale) + .add(Index) + .addDisp(Disp, 8) + .add(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { @@ -24867,7 +25397,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, // // (CMOV (CMOV F, T, cc1), T, cc2) // - // to two successives branches. For that, we look for another CMOV as the + // to two successive branches. For that, we look for another CMOV as the // following instruction. // // Without this, we would add a PHI between the two jumps, which ends up @@ -25123,12 +25653,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); - MIB.addOperand(Operand); + MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. 
return BB; @@ -25508,7 +26038,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else - MIB.addOperand(MI.getOperand(MemOpndSlot + i)); + MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); @@ -25591,7 +26121,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); @@ -25599,7 +26129,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), LabelOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP @@ -25608,7 +26138,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else - MIB.addOperand(MI.getOperand(i)); + MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump @@ -25625,7 +26155,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); @@ -25644,8 +26174,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; - /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */ - if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) @@ -25655,7 +26183,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) - .addReg(0) /* XII->getGlobalBaseReg(MF) */ + .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) @@ -25677,7 +26205,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're @@ -25749,9 +26277,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); unsigned MJTI = JTI->createJumpTableIndex(LPadList); - const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); - const X86RegisterInfo &RI = XII->getRegisterInfo(); - + const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { @@ -25799,8 +26325,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // N.B. the order the invoke BBs are processed in doesn't matter here. 
SmallVector<MachineBasicBlock *, 64> MBBLPads; - const MCPhysReg *SavedRegs = - Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF); + const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. @@ -26033,6 +26558,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); + + // Cache line zero + case X86::CLZERO: + return emitClzero(&MI, BB, Subtarget); + // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); @@ -26137,10 +26667,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || @@ -26167,44 +26699,91 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; LLVM_FALLTHROUGH; case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); + KnownZero.setBits(1, BitWidth); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); + KnownZero.setBits(NumLoBits, BitWidth); + break; + } + case X86ISD::VSHLI: + case X86ISD::VSRLI: { + if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { + KnownZero = APInt::getAllOnesValue(BitWidth); + break; + } + + DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1); + unsigned ShAmt = ShiftImm->getZExtValue(); + if (Opc == X86ISD::VSHLI) { + KnownZero = KnownZero << ShAmt; + KnownOne = KnownOne << ShAmt; + // Low bits are known zero. + KnownZero.setLowBits(ShAmt); + } else { + KnownZero = KnownZero.lshr(ShAmt); + KnownOne = KnownOne.lshr(ShAmt); + // High bits are known zero. 
+ KnownZero.setHighBits(ShAmt); + } + } break; } case X86ISD::VZEXT: { SDValue N0 = Op.getOperand(0); - unsigned NumElts = Op.getValueType().getVectorNumElements(); - unsigned InNumElts = N0.getValueType().getVectorNumElements(); - unsigned InBitWidth = N0.getValueType().getScalarSizeInBits(); + unsigned NumElts = VT.getVectorNumElements(); + + EVT SrcVT = N0.getValueType(); + unsigned InNumElts = SrcVT.getVectorNumElements(); + unsigned InBitWidth = SrcVT.getScalarSizeInBits(); + assert(InNumElts >= NumElts && "Illegal VZEXT input"); KnownZero = KnownOne = APInt(InBitWidth, 0); - APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts); - DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1); + APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); + DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1); KnownOne = KnownOne.zext(BitWidth); KnownZero = KnownZero.zext(BitWidth); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth); + KnownZero.setBits(InBitWidth, BitWidth); break; } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { - // SETCC_CARRY sets the dest to ~0 for true or 0 for false. - if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getScalarValueSizeInBits(); + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + unsigned Depth) const { + unsigned VTBits = Op.getScalarValueSizeInBits(); + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case X86ISD::SETCC_CARRY: + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + return VTBits; - if (Op.getOpcode() == X86ISD::VSEXT) { - EVT VT = Op.getValueType(); - EVT SrcVT = Op.getOperand(0).getValueType(); - unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits(); + case X86ISD::VSEXT: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + Tmp += VTBits - Src.getScalarValueSizeInBits(); return Tmp; } + case X86ISD::VSRAI: { + SDValue Src = Op.getOperand(0); + unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + ShiftVal += Tmp; + return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); + } + + case X86ISD::PCMPGT: + case X86ISD::PCMPEQ: + case X86ISD::CMPP: + case X86ISD::VPCOM: + case X86ISD::VPCOMU: + // Vector compares return zero/all-bits result values. + return VTBits; + } + // Fallback case. return 1; } @@ -26228,24 +26807,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). - if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && - isUndefOrEqual(Mask[0], 0) && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - - // Match against a VZEXT instruction. - // TODO: Add 256/512-bit vector support. 
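The computeKnownBitsForTargetNode cases added a little above for X86ISD::VSHLI/VSRLI record that an immediate left shift clears the low ShAmt bits and an immediate logical right shift clears the high ShAmt bits. A minimal standalone check with an arbitrary shift amount and sampled inputs:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 5;
  for (uint32_t x = 0; x < 1000000u; x += 4099u) {
    uint32_t Shl  = x << ShAmt;
    uint32_t Lshr = x >> ShAmt;
    assert((Shl & ((1u << ShAmt) - 1)) == 0);   // low ShAmt bits are known zero
    assert((Lshr >> (32 - ShAmt)) == 0);        // high ShAmt bits are known zero
  }
  return 0;
}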
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) { + // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. + // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; @@ -26255,19 +26827,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { - SrcVT = MaskVT; + unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); + SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); + if (SrcVT != MaskVT) + V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = X86ISD::VZEXT; + Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) + : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } } + // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). + if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && + isUndefOrEqual(Mask[0], 0) && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { + Shuffle = X86ISD::VZEXT_MOVL; + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + return true; + } + // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. - if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) { + if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; @@ -26285,7 +26870,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is256BitVector() && FloatDomain) { + if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; @@ -26304,7 +26889,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } } - if (MaskVT.is512BitVector() && FloatDomain) { + if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { @@ -26343,24 +26928,26 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, + bool AllowFloatDomain, + bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); bool ContainsZeros = false; - SmallBitVector Zeroable(NumMaskElts, false); + APInt Zeroable(NumMaskElts, false); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; - Zeroable[i] = isUndefOrZero(M); + if (isUndefOrZero(M)) + Zeroable.setBit(i); ContainsZeros |= (M == SM_SentinelZero); } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. 
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, MaskVT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); @@ -26423,19 +27010,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if (FloatDomain && !Subtarget.hasAVX()) + if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX()) return false; // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) - FloatDomain = true; + if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) { + AllowFloatDomain = true; + AllowIntDomain = false; + } // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); + ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } @@ -26443,7 +27032,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, SmallVector<int, 4> RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; - ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } @@ -26452,7 +27041,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // VPERMILPD can permute with a non-repeating shuffle. - if (FloatDomain && MaskScalarSizeInBits == 64) { + if (AllowFloatDomain && MaskScalarSizeInBits == 64) { Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; @@ -26476,8 +27065,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); - Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); - ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32); + Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); + ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; @@ -26487,34 +27076,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. 
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, SDValue &V1, SDValue &V2, + bool AllowFloatDomain, bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { - if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; ShuffleVT = MVT::v4f32; return true; } - if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) { + if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; ShuffleVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; ShuffleVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && - (FloatDomain || !Subtarget.hasSSE41())) { + (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; ShuffleVT = MaskVT; return true; @@ -26527,57 +27118,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { - MVT LegalVT = MaskVT; - if (LegalVT.is256BitVector() && !Subtarget.hasAVX2()) - LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); - - SmallVector<int, 64> Unpckl, Unpckh; - if (IsUnary) { - createUnpackShuffleMask(MaskVT, Unpckl, true, true); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - V2 = V1; - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, true); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - V2 = V1; - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - } else { - createUnpackShuffleMask(MaskVT, Unpckl, true, false); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - createUnpackShuffleMask(MaskVT, Unpckh, false, false); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckl); - if (isTargetShuffleEquivalent(Mask, Unpckl)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKL; - ShuffleVT = LegalVT; - return true; - } - - ShuffleVectorSDNode::commuteMask(Unpckh); - if (isTargetShuffleEquivalent(Mask, Unpckh)) { - std::swap(V1, V2); - Shuffle = X86ISD::UNPCKH; - ShuffleVT = LegalVT; - return true; - } + if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, + DAG, Subtarget)) { + ShuffleVT = MaskVT; + if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2()) + ShuffleVT = (32 == EltSizeInBits ? 
MVT::v8f32 : MVT::v4f64); + return true; } } @@ -26585,17 +27131,19 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, - bool FloatDomain, - SDValue &V1, SDValue &V2, - SDLoc &DL, SelectionDAG &DAG, + bool AllowFloatDomain, + bool AllowIntDomain, + SDValue &V1, SDValue &V2, SDLoc &DL, + SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); + unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. - if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; @@ -26606,77 +27154,74 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to X86ISD::BLENDI. - if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || - (Subtarget.hasAVX() && MaskVT.is256BitVector()))) { - // Determine a type compatible with X86ISD::BLENDI. - // TODO - add 16i16 support (requires lane duplication). - MVT BlendVT = MaskVT; - if (Subtarget.hasAVX2()) { - if (BlendVT == MVT::v4i64) - BlendVT = MVT::v8i32; - else if (BlendVT == MVT::v2i64) - BlendVT = MVT::v4i32; - } else { - if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) - BlendVT = MVT::v8i16; - else if (BlendVT == MVT::v4i64) - BlendVT = MVT::v4f64; - else if (BlendVT == MVT::v8i32) - BlendVT = MVT::v8f32; - } - - unsigned BlendSize = BlendVT.getVectorNumElements(); - unsigned MaskRatio = BlendSize / NumMaskElts; - - // Can we blend with zero? - if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts, - /*Low*/ 0) && - NumMaskElts <= BlendVT.getVectorNumElements()) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if (Mask[i / MaskRatio] < 0) - PermuteImm |= 1u << i; - - V2 = getZeroVector(BlendVT, Subtarget, DAG, DL); - Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; - return true; - } - - // Attempt to match as a binary blend. - if (NumMaskElts <= BlendVT.getVectorNumElements()) { - bool MatchBlend = true; - for (int i = 0; i != (int)NumMaskElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - else if (M == SM_SentinelZero) - MatchBlend = false; - else if ((M != i) && (M != (i + (int)NumMaskElts))) - MatchBlend = false; - } + if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || + (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || + (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { + uint64_t BlendMask = 0; + bool ForceV1Zero = false, ForceV2Zero = false; + SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, + BlendMask)) { + if (MaskVT == MVT::v16i16) { + // We can only use v16i16 PBLENDW if the lanes are repeated. + SmallVector<int, 8> RepeatedMask; + if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, + RepeatedMask)) { + assert(RepeatedMask.size() == 8 && + "Repeated mask size doesn't match!"); + PermuteImm = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 8) + PermuteImm |= 1 << i; + V1 = ForceV1Zero ? 
getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + Shuffle = X86ISD::BLENDI; + ShuffleVT = MaskVT; + return true; + } + } else { + // Determine a type compatible with X86ISD::BLENDI. + ShuffleVT = MaskVT; + if (Subtarget.hasAVX2()) { + if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v8i32; + else if (ShuffleVT == MVT::v2i64) + ShuffleVT = MVT::v4i32; + } else { + if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) + ShuffleVT = MVT::v8i16; + else if (ShuffleVT == MVT::v4i64) + ShuffleVT = MVT::v4f64; + else if (ShuffleVT == MVT::v8i32) + ShuffleVT = MVT::v8f32; + } - if (MatchBlend) { - PermuteImm = 0; - for (unsigned i = 0; i != BlendSize; ++i) - if ((int)NumMaskElts <= Mask[i / MaskRatio]) - PermuteImm |= 1u << i; + if (!ShuffleVT.isFloatingPoint()) { + int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); + BlendMask = + scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); + ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); + ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); + } + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; + PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; - ShuffleVT = BlendVT; return true; } } } // Attempt to combine to INSERTPS. - if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) { - SmallBitVector Zeroable(4, false); + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector()) { + APInt Zeroable(4, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] < 0) - Zeroable[i] = true; + Zeroable.setBit(i); - if (Zeroable.any() && + if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; @@ -26685,22 +27230,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to combine to SHUFPD. - if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) || - (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) || - (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 64 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. - if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || - (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || - (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) { + if (AllowFloatDomain && EltSizeInBits == 32 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector<int, 4> RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { + // Match each half of the repeated mask, to determine if its just + // referencing one of the vectors, is zeroable or entirely undef. 
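// A minimal standalone sketch (not LLVM code) of the SHUFPS semantics the
// half-matching below depends on: the 8-bit immediate packs four 2-bit lane
// selectors, the two low result lanes can only reference the first source and
// the two high result lanes can only reference the second.
#include <cassert>

static void shufps(const float *A, const float *B, unsigned Imm, float *R) {
  R[0] = A[(Imm >> 0) & 3];
  R[1] = A[(Imm >> 2) & 3];
  R[2] = B[(Imm >> 4) & 3];
  R[3] = B[(Imm >> 6) & 3];
}

int main() {
  float A[4] = {0, 1, 2, 3}, B[4] = {4, 5, 6, 7}, R[4];
  // Shuffle mask {2, 0, 7, 4}: low half {2, 0} from A, high half {3, 0} of B.
  unsigned Imm = 2 | (0 << 2) | (3 << 4) | (0 << 6);
  shufps(A, B, Imm, R);
  assert(R[0] == 2 && R[1] == 0 && R[2] == 7 && R[3] == 4);
  return 0;
}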
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; @@ -26732,7 +27281,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; - ShuffleVT = MaskVT; + ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } @@ -26764,7 +27313,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); - SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1])); + SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) + : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); @@ -26853,6 +27403,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; + // Which shuffle domains are permitted? + // Permit domain crossing at higher combine depths. + bool AllowFloatDomain = FloatDomain || (Depth > 3); + bool AllowIntDomain = !FloatDomain || (Depth > 3); + if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper @@ -26869,8 +27424,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle, - ShuffleSrcVT, ShuffleVT)) { + if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26884,8 +27440,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, - Shuffle, ShuffleVT, PermuteImm)) { + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + AllowIntDomain, Subtarget, Shuffle, + ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26901,8 +27458,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget, - Shuffle, ShuffleVT, UnaryShuffle)) { + if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, + V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, + UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) @@ -26918,8 +27476,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL, - DAG, Subtarget, Shuffle, ShuffleVT, + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + AllowIntDomain, V1, V2, DL, DAG, + Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) return false; // Nothing to do! 
@@ -27039,12 +27598,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); - SmallBitVector UndefElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) @@ -27228,8 +27787,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, // Extract constant bits from each source op. bool OneUseConstantOp = false; - SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps); - SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps); + SmallVector<APInt, 16> UndefEltsOps(NumOps); + SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); @@ -27245,18 +27804,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, return false; // Shuffle the constant bits according to the mask. - SmallBitVector UndefElts(NumMaskElts, false); - SmallBitVector ZeroElts(NumMaskElts, false); - SmallBitVector ConstantElts(NumMaskElts, false); + APInt UndefElts(NumMaskElts, 0); + APInt ZeroElts(NumMaskElts, 0); + APInt ConstantElts(NumMaskElts, 0); SmallVector<APInt, 8> ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); @@ -27266,21 +27825,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { - UndefElts[i] = true; + UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { - ZeroElts[i] = true; + ZeroElts.setBit(i); continue; } - ConstantElts[i] = true; + ConstantElts.setBit(i); ConstantBitData[i] = Bits; } - assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts); + assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. MVT MaskSVT; @@ -27330,6 +27889,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, ArrayRef<int> RootMask, + ArrayRef<const SDNode*> SrcNodes, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -27353,13 +27913,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. - SDValue Input0, Input1; - SmallVector<int, 16> OpMask; - if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask)) + SmallVector<int, 64> OpMask; + SmallVector<SDValue, 2> OpInputs; + if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask)) return false; + assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); + SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); + SDValue Input1 = (OpInputs.size() > 1 ? 
OpInputs[1] : SDValue()); + // Add the inputs to the Ops list, avoiding duplicates. - SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end()); + SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); int InputIdx0 = -1, InputIdx1 = -1; for (int i = 0, e = Ops.size(); i < e; ++i) { @@ -27392,8 +27956,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, (RootRatio == 1) != (OpRatio == 1)) && "Must not have a ratio for both incoming and op masks!"); - SmallVector<int, 16> Mask; - Mask.reserve(MaskWidth); + SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the @@ -27403,7 +27966,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, int RootIdx = i / RootRatio; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. - Mask.push_back(RootMask[RootIdx]); + Mask[i] = RootMask[RootIdx]; continue; } @@ -27413,7 +27976,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { - Mask.push_back(RootMaskedIdx); + Mask[i] = RootMaskedIdx; continue; } @@ -27423,7 +27986,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. - Mask.push_back(OpMask[OpIdx]); + Mask[i] = OpMask[OpIdx]; continue; } @@ -27439,7 +28002,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, OpMaskedIdx += InputIdx1 * MaskWidth; } - Mask.push_back(OpMaskedIdx); + Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. @@ -27457,28 +28020,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, } // Remove unused shuffle source ops. - SmallVector<SDValue, 8> UsedOps; - for (int i = 0, e = Ops.size(); i < e; ++i) { - int lo = UsedOps.size() * MaskWidth; - int hi = lo + MaskWidth; - if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { - UsedOps.push_back(Ops[i]); - continue; - } - for (int &M : Mask) - if (lo <= M) - M -= MaskWidth; - } - assert(!UsedOps.empty() && "Shuffle with no inputs detected"); - Ops = UsedOps; + resolveTargetShuffleInputsAndMask(Ops, Mask); + assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); - // See if we can recurse into each shuffle source op (if it's a target shuffle). + // Update the list of shuffle nodes that have been combined so far. + SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), + SrcNodes.end()); + CombinedNodes.push_back(Op.getNode()); + + // See if we can recurse into each shuffle source op (if it's a target + // shuffle). The source op should only be combined if it either has a + // single use (i.e. current Op) or all its users have already been combined. 
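// A minimal standalone sketch (not LLVM code) of the mask merge performed
// earlier in this function, for the simple case where the root mask and the
// op mask have the same width (RootRatio == OpRatio == 1): the op's shuffle
// is applied first and the root's second, so the combined mask is
// OpMask[RootMask[i]], with undef/zero sentinels passed straight through.
#include <cassert>
#include <vector>

static const int Zero = -2; // stand-in for SM_SentinelZero

static std::vector<int> compose(const std::vector<int> &RootMask,
                                const std::vector<int> &OpMask) {
  std::vector<int> Mask(RootMask.size(), -1);
  for (int i = 0, e = (int)RootMask.size(); i != e; ++i) {
    if (RootMask[i] < 0) { Mask[i] = RootMask[i]; continue; } // undef/zero lane
    Mask[i] = OpMask[RootMask[i]]; // chain the lookup through the inner op
  }
  return Mask;
}

int main() {
  // Root selects lanes {3, Zero, 1, 0} of the op's result; the op itself was
  // the reversal {3, 2, 1, 0}; the composition is {0, Zero, 2, 3}.
  std::vector<int> M = compose({3, Zero, 1, 0}, {3, 2, 1, 0});
  assert((M == std::vector<int>{0, Zero, 2, 3}));
  return 0;
}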
for (int i = 0, e = Ops.size(); i < e; ++i) - if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode())) - if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1, - HasVariableMask, DAG, DCI, Subtarget)) + if (Ops[i].getNode()->hasOneUse() || + SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) + if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes, + Depth + 1, HasVariableMask, DAG, DCI, + Subtarget)) return true; // Attempt to constant fold all of the constant source ops. @@ -27495,7 +28055,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. - SmallVector<int, 16> WidenedMask; + SmallVector<int, 64> WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } @@ -27561,8 +28121,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); @@ -27842,19 +28401,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } case X86ISD::MOVSD: case X86ISD::MOVSS: { - bool isFloat = VT.isFloatingPoint(); SDValue V0 = peekThroughBitcasts(N->getOperand(0)); SDValue V1 = peekThroughBitcasts(N->getOperand(1)); - bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); - bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); - assert(!(isZero0 && isZero1) && "Zeroable shuffle detected."); + if (isZero0 && isZero1) + return SDValue(); // We often lower to MOVSD/MOVSS from integer as well as native float // types; remove unnecessary domain-crossing bitcasts if we can to make it // easier to combine shuffles later on. We've already accounted for the // domain switching cost when we decided to lower with it. + bool isFloat = VT.isFloatingPoint(); + bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); + bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); @@ -28025,7 +28585,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; @@ -28173,12 +28733,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); - - // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) - return SDValue(); - // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. 
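// A minimal standalone sketch (not LLVM code; the example blend mask is an
// assumption) of the ADDSUB fusion mentioned above: ADDSUBPS computes
// {a0-b0, a1+b1, a2-b2, a3+b3}, i.e. a blend that takes even lanes from an
// FSUB and odd lanes from an FADD, which is the pattern the combine looks for
// once the types are legal.
#include <cassert>

int main() {
  float A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
  for (int i = 0; i != 4; ++i) {
    float Add = A[i] + B[i];
    float Sub = A[i] - B[i];
    float Blend = (i % 2 == 0) ? Sub : Add;              // e.g. mask <0,5,2,7>
    float AddSub = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i];
    assert(Blend == AddSub);
  }
  return 0;
}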
if (TLI.isTypeLegal(VT)) @@ -28249,11 +28804,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) - Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + Elts.push_back(Elt); + continue; + } + Elts.clear(); + break; + } - if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) - return LD; + if (Elts.size() == VT.getVectorNumElements()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) + return LD; // For AVX2, we sometimes want to combine // (vector_shuffle <mask> (concat_vectors t1, undef) @@ -28276,7 +28838,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // a particular chain. SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. @@ -28303,18 +28865,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EVT OriginalVT = InVec.getValueType(); - if (InVec.getOpcode() == ISD::BITCAST) { - // Don't duplicate a load with other uses. - if (!InVec.hasOneUse()) - return SDValue(); - EVT BCVT = InVec.getOperand(0).getValueType(); - if (!BCVT.isVector() || - BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) - return SDValue(); - InVec = InVec.getOperand(0); - } + // Peek through bitcasts, don't duplicate a load with other uses. + InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); + if (!CurrentVT.isVector() || + CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -28393,19 +28950,41 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + EVT SrcVT = N0.getValueType(); + + // Since MMX types are special and don't usually play with other vector types, + // it's better to handle them early to be sure we emit efficient code by + // avoiding store-load conversions. - // Detect bitcasts between i32 to x86mmx low word. Since MMX types are - // special and don't usually play with other vector types, it's better to - // handle them early to be sure we emit efficient code by avoiding - // store-load conversions. + // Detect bitcasts between i32 to x86mmx low word. if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && - N0.getValueType() == MVT::v2i32 && - isNullConstant(N0.getOperand(1))) { + SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } + // Detect bitcasts between element or subvector extraction to x86mmx. 
+ if (VT == MVT::x86mmx && + (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType().is128BitVector()) + return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, + DAG.getBitcast(MVT::v2i64, N00)); + } + + // Detect bitcasts from FP_TO_SINT to x86mmx. + if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && + N0.getOpcode() == ISD::FP_TO_SINT) { + SDLoc DL(N0); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, + DAG.getUNDEF(MVT::v2i32)); + return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, + DAG.getBitcast(MVT::v2i64, Res)); + } + // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the @@ -28511,12 +29090,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOpcode() != ISD::SETCC) return false; ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); - if (CC != ISD::SETGT) + if (CC != ISD::SETGT && CC != ISD::SETLT) return false; SDValue SelectOp1 = Select->getOperand(1); SDValue SelectOp2 = Select->getOperand(2); + // The following instructions assume SelectOp1 is the subtraction operand + // and SelectOp2 is the negation operand. + // In the case of SETLT this is the other way around. + if (CC == ISD::SETLT) + std::swap(SelectOp1, SelectOp2); + // The second operand of the select should be the negation of the first // operand, which is implemented as 0 - SelectOp1. if (!(SelectOp2.getOpcode() == ISD::SUB && @@ -28529,8 +29114,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, if (SetCC.getOperand(0) != SelectOp1) return false; - // The second operand of the comparison can be either -1 or 0. - if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || + // In SetLT case, The second operand of the comparison can be either 1 or 0. + APInt SplatVal; + if ((CC == ISD::SETLT) && + !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && + SplatVal == 1) || + (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) + return false; + + // In SetGT case, The second operand of the comparison can be either -1 or 0. + if ((CC == ISD::SETGT) && + !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) return false; @@ -28576,17 +29170,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } +// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. +static SDValue combineHorizontalPredicateResult(SDNode *Extract, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Bail without SSE2 or with AVX512VL (which uses predicate registers). + if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + unsigned BitWidth = ExtractVT.getSizeInBits(); + if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && + ExtractVT != MVT::i8) + return SDValue(); + + // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. + for (ISD::NodeType Op : {ISD::OR, ISD::AND}) { + SDValue Match = matchBinOpReduction(Extract, Op); + if (!Match) + continue; + + // EXTRACT_VECTOR_ELT can require implicit extension of the vector element + // which we can't support here for now. 
+ if (Match.getScalarValueSizeInBits() != BitWidth) + continue; + + // We require AVX2 for PMOVMSKB for v16i16/v32i8; + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && + ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) + return SDValue(); + + // Don't bother performing this for 2-element vectors. + if (Match.getValueType().getVectorNumElements() <= 2) + return SDValue(); + + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); + + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskVT; + if (64 == BitWidth || 32 == BitWidth) + MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + APInt CompareBits; + ISD::CondCode CondCode; + if (Op == ISD::OR) { + // any_of -> MOVMSK != 0 + CompareBits = APInt::getNullValue(32); + CondCode = ISD::CondCode::SETNE; + } else { + // all_of -> MOVMSK == ((1 << NumElts) - 1) + CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CondCode = ISD::CondCode::SETEQ; + } + + // Perform the select as i32/i64 and then truncate to avoid partial register + // stalls. + unsigned ResWidth = std::max(BitWidth, 32u); + EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); + SDLoc DL(Extract); + SDValue Zero = DAG.getConstant(0, DL, ResVT); + SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); + SDValue Res = DAG.getBitcast(MaskVT, Match); + Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); + Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), + Ones, Zero, CondCode); + return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + } + + return SDValue(); +} + static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is appropriate - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. + // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) + if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; @@ -28595,15 +29264,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, else if (Subtarget.hasAVX2()) RegSize = 256; - // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. + // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) + if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. SDValue Root = matchBinOpReduction(Extract, ISD::ADD); + // The operand is expected to be zero extended from i8 + // (verified in detectZextAbsDiff). + // In order to convert to i64 and above, additional any/zero/sign + // extend is expected. + // The zero extend from 32 bit has no mathematical effect on the result. + // Also the sign extend is basically zero extend + // (extends the sign bit which is zero). + // So it is correct to skip the sign/zero extend instruction. 
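// A quick standalone check (not LLVM code) of the reasoning stated above: the
// psadbw inputs are zero-extended from i8, so a 16-lane partial SAD is at most
// 16 * 255 and the accumulator's sign bit can never be set; extending to a
// wider type before or after the additions therefore yields the same value,
// and sign-extension behaves like zero-extension.
#include <cassert>
#include <cstdint>
#include <cstdlib>

int main() {
  uint8_t A[16], B[16];
  for (int i = 0; i != 16; ++i) {
    A[i] = std::rand() & 0xFF;
    B[i] = std::rand() & 0xFF;
  }

  uint32_t Sad32 = 0;
  uint64_t Sad64 = 0;
  for (int i = 0; i != 16; ++i) {
    uint32_t D = A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]; // |A[i] - B[i]|
    Sad32 += D; // 32-bit accumulation, at most 16 * 255 = 4080
    Sad64 += D; // 64-bit accumulation
  }
  assert((int32_t)Sad32 >= 0);      // sign bit never set
  assert((uint64_t)Sad32 == Sad64); // extend-then-add == add-then-extend
  return 0;
}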
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) + Root = Root.getOperand(0); + // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || (Root.getOpcode() != ISD::VSELECT)) @@ -28614,7 +29296,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); - // Create the SAD instruction + // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); @@ -28636,13 +29318,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - // Return the lowest i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32); + MVT Type = Extract->getSimpleValueType(0); + unsigned TypeSizeInBits = Type.getSizeInBits(); + // Return the lowest TypeSizeInBits bits. + MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } +// Attempt to peek through a target shuffle and extract the scalar from the +// source. +static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Src = N->getOperand(0); + SDValue Idx = N->getOperand(1); + + EVT VT = N->getValueType(0); + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + + // Don't attempt this for boolean mask vectors or unknown extraction indices. + if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Resolve the target shuffle inputs and mask. + SmallVector<int, 16> Mask; + SmallVector<SDValue, 2> Ops; + if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask)) + return SDValue(); + + // Attempt to narrow/widen the shuffle mask to the correct size. + if (Mask.size() != NumSrcElts) { + if ((NumSrcElts % Mask.size()) == 0) { + SmallVector<int, 16> ScaledMask; + int Scale = NumSrcElts / Mask.size(); + scaleShuffleMask(Scale, Mask, ScaledMask); + Mask = std::move(ScaledMask); + } else if ((Mask.size() % NumSrcElts) == 0) { + SmallVector<int, 16> WidenedMask; + while (Mask.size() > NumSrcElts && + canWidenShuffleElements(Mask, WidenedMask)) + Mask = std::move(WidenedMask); + // TODO - investigate support for wider shuffle masks with known upper + // undef/zero elements for implicit zero-extension. + } + } + + // Check if narrowing/widening failed. + if (Mask.size() != NumSrcElts) + return SDValue(); + + int SrcIdx = Mask[N->getConstantOperandVal(1)]; + SDLoc dl(N); + + // If the shuffle source element is undef/zero then we can just accept it. + if (SrcIdx == SM_SentinelUndef) + return DAG.getUNDEF(VT); + + if (SrcIdx == SM_SentinelZero) + return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) + : DAG.getConstant(0, dl, VT); + + SDValue SrcOp = Ops[SrcIdx / Mask.size()]; + SrcOp = DAG.getBitcast(SrcVT, SrcOp); + SrcIdx = SrcIdx % Mask.size(); + + // We can only extract other elements from 128-bit vectors and in certain + // circumstances, depending on SSE-level. + // TODO: Investigate using extract_subvector for larger vectors. 
+ // TODO: Investigate float/double extraction if it will be just stored. + if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && + ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { + assert(SrcSVT == VT && "Unexpected extraction type"); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + } + + if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || + (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { + assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && + "Unexpected extraction type"); + unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); + SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, + DAG.getIntPtrConstant(SrcIdx, dl)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp, + DAG.getValueType(SrcSVT)); + return DAG.getZExtOrTrunc(Assert, dl, VT); + } + + return SDValue(); +} + /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading @@ -28653,14 +29425,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; + if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) + return NewOp; + SDValue InputVector = N->getOperand(0); + SDValue EltIdx = N->getOperand(1); + + EVT SrcVT = InputVector.getValueType(); + EVT VT = N->getValueType(0); SDLoc dl(InputVector); + + // Detect mmx extraction of all bits as a i64. It works better as a bitcast. + if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && + VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { + SDValue MMXSrc = InputVector.getOperand(0); + + // The bitcast source is a direct mmx result. + if (MMXSrc.getValueType() == MVT::x86mmx) + return DAG.getBitcast(VT, InputVector); + } + // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && - N->getValueType(0) == MVT::i32 && - InputVector.getValueType() == MVT::v2i32 && - isa<ConstantSDNode>(N->getOperand(1)) && - N->getConstantOperandVal(1) == 0) { + VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. @@ -28668,15 +29455,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } - EVT VT = N->getValueType(0); - - if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && - InputVector.getOpcode() == ISD::BITCAST && + if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && + isa<ConstantSDNode>(EltIdx) && isa<ConstantSDNode>(InputVector.getOperand(0))) { - uint64_t ExtractedElt = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - uint64_t InputValue = - cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + uint64_t ExtractedElt = N->getConstantOperandVal(1); + uint64_t InputValue = InputVector.getConstantOperandVal(0); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } @@ -28687,9 +29470,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; + // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. 
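// A minimal standalone sketch (not LLVM code) of the equivalence this combine
// relies on: when every lane is all-zeros or all-ones, OR-ing the lanes
// ("any_of") matches MOVMSK != 0 and AND-ing them ("all_of") matches
// MOVMSK == (1 << NumElts) - 1, where MOVMSK collects one sign bit per lane.
#include <cassert>
#include <cstdint>

int main() {
  const int32_t Tests[2][4] = {{0, -1, -1, 0}, {-1, -1, -1, -1}};
  for (const int32_t *V : Tests) {
    unsigned Msk = 0;
    int32_t AnyOf = 0, AllOf = -1;
    for (int i = 0; i != 4; ++i) {
      Msk |= ((uint32_t)V[i] >> 31) << i; // MOVMSKPS-style: one sign bit/lane
      AnyOf |= V[i];
      AllOf &= V[i];
    }
    assert((AnyOf != 0) == (Msk != 0));    // any_of  <->  MOVMSK != 0
    assert((AllOf == -1) == (Msk == 0xF)); // all_of  <->  MOVMSK == 0b1111
  }
  return 0;
}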
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) + return Cmp; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. - if (InputVector.getValueType() != MVT::v4i32) + if (SrcVT != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a @@ -28717,9 +29504,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); // Record which element was extracted. - ExtractedElements |= - 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); - + ExtractedElements |= 1 << Extract->getConstantOperandVal(1); Uses.push_back(Extract); } @@ -28752,11 +29537,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo()); - EVT ElementType = InputVector.getValueType().getVectorElementType(); + EVT ElementType = SrcVT.getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. @@ -28779,8 +29564,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; - SDValue Idx = Extract->getOperand(1); - uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + uint64_t IdxVal = Extract->getConstantOperandVal(1); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } @@ -28788,6 +29572,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// TODO - merge with combineExtractVectorElt once it can handle the implicit +// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in: +// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and +// combineBasicSADPattern. +static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + return combineExtractWithShuffle(N, DAG, DCI, Subtarget); +} + /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. static SDValue @@ -28812,12 +29606,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // This situation only applies to avx512. if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { - //Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()), - DL, CondVT)); - //Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); + // Invert the cond to not(cond) : xor(op,allones)=not(op) + SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getAllOnesConstant(DL, CondVT)); + // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 + return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that @@ -28920,18 +29713,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { DAG.getConstant(ShAmt, DL, MVT::i8)); } - // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. - if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) { - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); - return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - } - // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { @@ -29049,7 +29830,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. - if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29063,7 +29844,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op1.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, Op1, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); return true; } case ISD::EXTRACT_SUBVECTOR: { @@ -29072,7 +29853,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return false; MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); // Only change element size, not type. - if (VT.isInteger() != OpEltVT.isInteger()) + if (EltVT.isInteger() != OpEltVT.isInteger()) return false; uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; @@ -29084,7 +29865,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, DCI.AddToWorklist(Op0.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, - DAG.getConstant(Imm, DL, MVT::i8))); + DAG.getIntPtrConstant(Imm, DL))); + return true; + } + case X86ISD::SUBV_BROADCAST: { + unsigned EltSize = EltVT.getSizeInBits(); + if (EltSize != 32 && EltSize != 64) + return false; + // Only change element size, not type. + if (VT.isInteger() != Op.getSimpleValueType().isInteger()) + return false; + SDValue Op0 = Op.getOperand(0); + MVT Op0VT = MVT::getVectorVT(EltVT, + Op0.getSimpleValueType().getSizeInBits() / EltSize); + Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0)); + DCI.AddToWorklist(Op0.getNode()); + DCI.CombineTo(OrigOp.getNode(), + DAG.getNode(Opcode, DL, VT, Op0)); return true; } } @@ -29370,8 +30167,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the - // condition so that the blends can use the high bit of each element and use - // SimplifyDemandedBits to simplify the condition operand. + // condition so that blends can use the high (sign) bit of each element and + // use SimplifyDemandedBits to simplify the condition operand. 
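// A minimal standalone sketch (not LLVM code) of why only the sign bit of the
// condition matters here: variable blends such as BLENDVPS/PBLENDVB select
// each lane solely from the mask element's most significant bit, so
// SimplifyDemandedBits may rewrite the condition freely as long as the sign
// bits are preserved. Which source a set bit picks is not modeled here.
#include <cassert>
#include <cstdint>

static int32_t blendLane(int32_t Mask, int32_t A, int32_t B) {
  return (Mask < 0) ? B : A; // the hardware looks only at the sign bit
}

int main() {
  int32_t A = 10, B = 20;
  // An all-ones mask and a sign-bit-only mask blend identically...
  assert(blendLane(-1, A, B) == blendLane(INT32_MIN, A, B));
  // ...and clearing just the sign bit is enough to flip the selection.
  assert(blendLane(0, A, B) == blendLane(0x7FFFFFFF, A, B));
  return 0;
}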
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { @@ -29406,49 +30203,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); - APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); - + APInt DemandedMask(APInt::getSignBit(BitWidth)); APInt KnownZero, KnownOne; TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) { - // If we changed the computation somewhere in the DAG, this change - // will affect all users of Cond. - // Make sure it is fine and update all the nodes so that we do not - // use the generic VSELECT anymore. Otherwise, we may perform - // wrong optimizations as we messed up with the actual expectation + // If we changed the computation somewhere in the DAG, this change will + // affect all users of Cond. Make sure it is fine and update all the nodes + // so that we do not use the generic VSELECT anymore. Otherwise, we may + // perform wrong optimizations as we messed with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are - // set properly. - for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. + // Check all uses of the condition operand to check whether it will be + // consumed by non-BLEND instructions. Those may require that all bits + // are set properly. + for (SDNode *U : Cond->uses()) { + // TODO: Add other opcodes eventually lowered into BLEND. + if (U->getOpcode() != ISD::VSELECT) return SDValue(); + } - // Update all the users of the condition, before committing the change, - // so that the VSELECT optimizations that expect the correct vector - // boolean value will not be triggered. - for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); - I != E; ++I) - DAG.ReplaceAllUsesOfValueWith( - SDValue(*I, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), - Cond, I->getOperand(1), I->getOperand(2))); + // Update all users of the condition before committing the change, so + // that the VSELECT optimizations that expect the correct vector boolean + // value will not be triggered. + for (SDNode *U : Cond->uses()) { + SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), + U->getValueType(0), Cond, U->getOperand(1), + U->getOperand(2)); + DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); + } DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } - // At this point, only Cond is changed. Change the condition - // just for N to keep the opportunity to optimize all other - // users their own way. - DAG.ReplaceAllUsesOfValueWith( - SDValue(N, 0), - DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), - TLO.New, N->getOperand(1), N->getOperand(2))); + // Only Cond (rather than other nodes in the computation chain) was + // changed. Change the condition just for N to keep the opportunity to + // optimize all other users their own way. 
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); return SDValue(); } } @@ -29456,7 +30249,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Look for vselects with LHS/RHS being bitcasted from an operation that // can be executed on another type. Push the bitcast to the inputs of // the operation. This exposes opportunities for using masking instructions. - if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() && + if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && CondVT.getVectorElementType() == MVT::i1) { if (combineBitcastForMaskedOp(LHS, DAG, DCI)) return SDValue(N, 0); @@ -30208,22 +31001,37 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, } if (!NewMul) { - assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) - && "Both cases that could cause potential overflows should have " - "already been handled."); - if (isPowerOf2_64(MulAmt - 1)) - // (mul x, 2^N + 1) => (add (shl x, N), x) - NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), - DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt - 1), DL, - MVT::i8))); - - else if (isPowerOf2_64(MulAmt + 1)) - // (mul x, 2^N - 1) => (sub (shl x, N), x) - NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, - N->getOperand(0), - DAG.getConstant(Log2_64(MulAmt + 1), - DL, MVT::i8)), N->getOperand(0)); + assert(MulAmt != 0 && + MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && + "Both cases that could cause potential overflows should have " + "already been handled."); + int64_t SignMulAmt = C->getSExtValue(); + if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) && + (SignMulAmt != -INT64_MAX)) { + int NumSign = SignMulAmt > 0 ? 
1 : -1; + bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); + bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); + if (IsPowerOf2_64PlusOne) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode( + ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, + MVT::i8))); + } else if (IsPowerOf2_64MinusOne) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode( + ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, + MVT::i8)), + N->getOperand(0)); + } + // To negate, subtract the number from zero + if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) + NewMul = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); + } } if (NewMul) @@ -30396,42 +31204,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) && - "Unexpected opcode"); +static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || + X86ISD::VSRLI == Opcode) && + "Unexpected shift opcode"); + bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - - // This fails for mask register (vXi1) shifts. - if ((NumBitsPerElt % 8) != 0) - return SDValue(); + assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && + "Unexpected value type"); // Out of range logical bit shifts are guaranteed to be zero. - APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); - if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // Out of range arithmetic bit shifts splat the sign bit. + APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue(); + if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { + if (LogicalShift) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + else + ShiftVal = NumBitsPerElt - 1; + } // Shift N0 by zero -> N0. if (!ShiftVal) - return N->getOperand(0); + return N0; // Shift zero -> zero. - if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + if (ISD::isBuildVectorAllZeros(N0.getNode())) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); + // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). + // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. + // TODO - support other sra opcodes as needed. + if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && + N0.getOpcode() == X86ISD::VSRAI) + return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); + // We can decode 'whole byte' logical bit shifts as shuffles. - if ((ShiftVal.getZExtValue() % 8) == 0) { + if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. 
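// A minimal standalone sketch (not LLVM code) of the "whole byte shifts are
// shuffles" claim a few lines up: shifting a 32-bit lane left by 8 produces
// the same little-endian byte image as the byte shuffle {Z, 0, 1, 2}
// (Z = zero), so such shifts can be handed to the shuffle combiner. Assumes a
// little-endian host, as on x86.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t X = 0x11223344u;
  uint32_t Shifted = X << 8;

  uint8_t Bytes[4], Shuffled[4];
  std::memcpy(Bytes, &X, 4);
  const int Mask[4] = {-1, 0, 1, 2}; // -1 stands for a zeroed byte
  for (int i = 0; i != 4; ++i)
    Shuffled[i] = Mask[i] < 0 ? 0 : Bytes[Mask[i]];

  assert(std::memcmp(Shuffled, &Shifted, 4) == 0);
  return 0;
}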
NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. } + // Constant Folding. + APInt UndefElts; + SmallVector<APInt, 32> EltBits; + if (N->isOnlyUserOf(N0.getNode()) && + getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { + assert(EltBits.size() == VT.getVectorNumElements() && + "Unexpected shift value type"); + unsigned ShiftImm = ShiftVal.getZExtValue(); + for (APInt &Elt : EltBits) { + if (X86ISD::VSHLI == Opcode) + Elt = Elt.shl(ShiftImm); + else if (X86ISD::VSRAI == Opcode) + Elt = Elt.ashr(ShiftImm); + else + Elt = Elt.lshr(ShiftImm); + } + return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); + } + + return SDValue(); +} + +static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + assert( + ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || + (N->getOpcode() == X86ISD::PINSRW && + N->getValueType(0) == MVT::v8i16)) && + "Unexpected vector insertion"); + + // Attempt to combine PINSRB/PINSRW patterns to a shuffle. + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget); return SDValue(); } @@ -30550,33 +31411,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64) return SDValue(); - // Canonicalize XOR to the left. - if (N1.getOpcode() == ISD::XOR) - std::swap(N0, N1); + if (N0.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); - if (N0.getOpcode() != ISD::XOR) - return SDValue(); - - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); + if (N1.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); - N01 = peekThroughBitcasts(N01); - - // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an - // insert_subvector building a 256-bit AllOnes vector. - if (!ISD::isBuildVectorAllOnes(N01.getNode())) { - if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR) - return SDValue(); - - SDValue V1 = N01->getOperand(0); - SDValue V2 = N01->getOperand(1); - if (V1.getOpcode() != ISD::INSERT_SUBVECTOR || - !V1.getOperand(0).isUndef() || - !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) || - !ISD::isBuildVectorAllOnes(V2.getNode())) - return SDValue(); - } - return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1); + return SDValue(); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized @@ -30696,38 +31539,34 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is -/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to -/// eliminate loading the vector constant mask value. This relies on the fact -/// that a PCMP always creates an all-ones or all-zeros bitmask per element. 
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) { +/// If this is a zero/all-bits result that is bitwise-anded with a low bits +/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' +/// with a shift-right to eliminate loading the vector constant mask value. +static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); + EVT VT0 = Op0.getValueType(); + EVT VT1 = Op1.getValueType(); - // TODO: Use AssertSext to mark any nodes that have the property of producing - // all-ones or all-zeros. Then check for that node rather than particular - // opcodes. - if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT) + if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); - // The existence of the PCMP node guarantees that we have the required SSE2 or - // AVX2 for a shift of this vector type, but there is no vector shift by - // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the - // masked compare nodes, so they should not make it here. - EVT VT0 = Op0.getValueType(); - EVT VT1 = Op1.getValueType(); - unsigned EltBitWidth = VT0.getScalarSizeInBits(); - if (VT0 != VT1 || EltBitWidth == 8) + APInt SplatVal; + if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || + !SplatVal.isMask()) return SDValue(); - assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256); + if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) + return SDValue(); - APInt SplatVal; - if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1) + unsigned EltBitWidth = VT0.getScalarSizeInBits(); + if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); - SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8); + unsigned ShiftVal = SplatVal.countTrailingOnes(); + SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -30747,7 +31586,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; - if (SDValue ShiftRight = combinePCMPAnd1(N, DAG)) + if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; EVT VT = N->getValueType(0); @@ -30760,7 +31599,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue Op(N, 0); SmallVector<int, 1> NonceMask; // Just a placeholder. NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) return SDValue(); // This routine will use CombineTo to replace N. 
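// The rewrite above rests on a simple scalar identity, sketched here with an
// invented helper name: when every bit of X is a copy of its sign bit (X is
// either 0 or all-ones), AND-ing with a mask of the K low bits is the same as
// a logical shift right by (width - K), so no constant-pool mask is needed.
static unsigned andLowMaskViaShift(int X, unsigned K) {
  // Precondition mirrored from the combine: ComputeNumSignBits(X) == 32,
  // i.e. X == 0 or X == -1. Then, for 0 < K <= 32:
  //   X & <mask of the K low bits>  ==  (unsigned)X >> (32 - K)
  return (unsigned)X >> (32 - K);
}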
@@ -30969,7 +31808,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && - N->getOperand(1).getConstantOperandVal(1) == 0 && + isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; @@ -31272,6 +32111,74 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } +/// Check if truncation with saturation form type \p SrcVT to \p DstVT +/// is valid for the given \p Subtarget. +static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + + // FIXME: Scalar type may be supported if we move it to vector register. + if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512) + return false; + + EVT SrcElVT = SrcVT.getScalarType(); + EVT DstElVT = DstVT.getScalarType(); + if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64) + return false; + if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32) + return false; + if (SrcVT.is512BitVector() || Subtarget.hasVLX()) + return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); + return false; +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectUSatPattern(SDValue In, EVT VT) { + if (In.getOpcode() != ISD::UMIN) + return SDValue(); + + //Saturation with truncation. We truncate from InVT to VT. + assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() && + "Unexpected types for truncate operation"); + + APInt C; + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { + // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according + // the element size of the destination type. + return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : + SDValue(); + } + return SDValue(); +} + +/// Detect a pattern of truncation with saturation: +/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). +/// The types should allow to use VPMOVUS* instruction on AVX512. +/// Return the source value to be truncated or SDValue() if the pattern was not +/// matched. +static SDValue detectAVX512USatPattern(SDValue In, EVT VT, + const X86Subtarget &Subtarget) { + if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return SDValue(); + return detectUSatPattern(In, VT); +} + +static SDValue +combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT)) + return SDValue(); + if (auto USatVal = detectUSatPattern(In, VT)) + if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + return SDValue(); +} + /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. 
@@ -31664,7 +32571,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } @@ -31838,6 +32745,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); + if (SDValue Val = + detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); @@ -32198,13 +33111,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); - auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) { - // TODO: Add extra cases where we can truncate both inputs for the - // cost of one (or none). - // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y ) + auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) { + unsigned TruncSizeInBits = VT.getScalarSizeInBits(); + + // Repeated operand, so we are only trading one output truncation for + // one input truncation. if (Op0 == Op1) return true; + // See if either operand has been extended from a smaller/equal size to + // the truncation size, allowing a truncation to combine with the extend. + unsigned Opcode0 = Op0.getOpcode(); + if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND || + Opcode0 == ISD::ZERO_EXTEND) && + Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + unsigned Opcode1 = Op1.getOpcode(); + if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND || + Opcode1 == ISD::ZERO_EXTEND) && + Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) + return true; + + // See if either operand is a single use constant which can be constant + // folded. SDValue BC0 = peekThroughOneUseBitcasts(Op0); SDValue BC1 = peekThroughOneUseBitcasts(Op1); return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) || @@ -32236,7 +33166,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32252,7 +33182,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && - IsRepeatedOpOrOneUseConstant(Op0, Op1)) + IsRepeatedOpOrFreeTruncation(Op0, Op1)) return TruncateArithmetic(Op0, Op1); break; } @@ -32458,6 +33388,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; + // Try to combine truncation with unsigned saturation. + if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget)) + return Val; + // The bitcast source is a direct mmx result. 
// Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { @@ -32804,6 +33738,34 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); } +/// Do target-specific dag combines on X86ISD::ANDNP nodes. +static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + // ANDNP(0, x) -> x + if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) + return N->getOperand(1); + + // ANDNP(x, 0) -> 0 + if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode())) + return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N)); + + EVT VT = N->getValueType(0); + + // Attempt to recursively combine a bitmask ANDNP with shuffles. + if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + SDValue Op(N, 0); + SmallVector<int, 1> NonceMask; // Just a placeholder. + NonceMask.push_back(0); + if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, + /*Depth*/ 1, /*HasVarMask*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. + } + + return SDValue(); +} + static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // BT ignores high bits in the bit index operand. @@ -33065,13 +34027,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue AllOnes = - DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); } return SDValue(); } + if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && + isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { + // Invert and sign-extend a boolean is the same as zero-extend and subtract + // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently + // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. + // sext (xor Bool, -1) --> sub (zext Bool), 1 + SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); + } + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -33212,8 +34183,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Optimize x == -y --> x+y == 0 -/// x != -y --> x+y != 0 +/// Try to map a 128-bit or larger integer comparison to vector instructions +/// before type legalization splits it up into chunks. +static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); + + // We're looking for an oversized integer equality comparison, but ignore a + // comparison with zero because that gets special treatment in EmitTest(). + SDValue X = SetCC->getOperand(0); + SDValue Y = SetCC->getOperand(1); + EVT OpVT = X.getValueType(); + unsigned OpSize = OpVT.getSizeInBits(); + if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y)) + return SDValue(); + + // TODO: Use PXOR + PTEST for SSE4.1 or later? + // TODO: Add support for AVX-512. 
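// For intuition, the 128-bit case emitted below corresponds to the following
// SSE2 intrinsic sequence; this is a standalone sketch, not code used by the
// combine itself:
#include <emmintrin.h>
static bool equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128((const __m128i *)A);
  __m128i VB = _mm_loadu_si128((const __m128i *)B);
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);        // pcmpeqb: 0xFF per equal byte
  return _mm_movemask_epi8(Eq) == 0xFFFF;     // pmovmskb: all 16 bytes equal
}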
+ EVT VT = SetCC->getValueType(0); + SDLoc DL(SetCC); + if ((OpSize == 128 && Subtarget.hasSSE2()) || + (OpSize == 256 && Subtarget.hasAVX2())) { + EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8; + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + + // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. + // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq + // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne + // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq + // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY); + SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); + SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL, + MVT::i32); + return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); + } + + return SDValue(); +} + static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); @@ -33222,21 +34232,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDLoc DL(N); - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + if (CC == ISD::SETNE || CC == ISD::SETEQ) { + EVT OpVT = LHS.getValueType(); + // 0-x == y --> x+y == 0 + // 0-x != y --> x+y != 0 + if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && + LHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } - if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && + RHS.hasOneUse()) { + SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); + return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } + if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) + return V; + } + if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { bool IsSEXT0 = @@ -33293,56 +34309,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of performSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces -// an all-ones bit which is more useful than 0/1 in some cases. 
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS, - SelectionDAG &DAG, MVT VT) { - if (VT == MVT::i8) - return DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS), - DAG.getConstant(1, DL, VT)); - assert (VT == MVT::i1 && "Unexpected type for SECCC node"); - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(X86::COND_B, DL, MVT::i8), - EFLAGS)); -} - // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); - if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate - // materializing "setb reg". - // - // Do not flip "e > c", where "c" is a constant, because Cmp instruction - // cannot take an immediate as its first operand. - // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && - EFLAGS.getValueType().isInteger() && - !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { - SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), - EFLAGS.getNode()->getVTList(), - EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); - return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0)); - } - } - - // Materialize "setb reg" as "sbb reg,reg", since it can be extended without - // a zext and produces an all-ones bit which is more useful than 0/1 in some - // cases. - if (CC == X86::COND_B) - return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) return getSETCC(CC, Flags, DL, DAG); @@ -33352,7 +34325,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); @@ -33538,45 +34510,159 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// fold (add Y, (sete X, 0)) -> adc 0, Y -/// (add Y, (setne X, 0)) -> sbb -1, Y -/// (sub (sete X, 0), Y) -> sbb 0, Y -/// (sub (setne X, 0), Y) -> adc -1, Y -static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { +/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit +/// which is more useful than 0/1 in some cases. +static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { SDLoc DL(N); + // "Condition code B" is also known as "the carry flag" (CF). + SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); + SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); + MVT VT = N->getSimpleValueType(0); + if (VT == MVT::i8) + return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); - // Look through ZExts. - SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); - if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) - return SDValue(); + assert(VT == MVT::i1 && "Unexpected type for SETCC node"); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); +} + +/// If this is an add or subtract where one operand is produced by a cmp+setcc, +/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} +/// with CMP+{ADC, SBB}. +static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { + bool IsSub = N->getOpcode() == ISD::SUB; + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + + // If this is an add, canonicalize a zext operand to the RHS. + // TODO: Incomplete? What if both sides are zexts? + if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && + Y.getOpcode() != ISD::ZERO_EXTEND) + std::swap(X, Y); + + // Look through a one-use zext. + bool PeekedThroughZext = false; + if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { + Y = Y.getOperand(0); + PeekedThroughZext = true; + } - SDValue SetCC = Ext.getOperand(0); - if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + // If this is an add, canonicalize a setcc operand to the RHS. + // TODO: Incomplete? What if both sides are setcc? + // TODO: Should we allow peeking through a zext of the other operand? + if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && + Y.getOpcode() != X86ISD::SETCC) + std::swap(X, Y); + + if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); - X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + SDLoc DL(N); + EVT VT = N->getValueType(0); + X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); + + if (CC == X86::COND_B) { + // X + SETB Z --> X + (mask SBB Z, Z) + // X - SETB Z --> X - (mask SBB Z, Z) + // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY? + SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + + if (CC == X86::COND_A) { + SDValue EFLAGS = Y->getOperand(1); + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), + EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG); + if (SBB.getValueSizeInBits() != VT.getSizeInBits()) + SBB = DAG.getZExtOrTrunc(SBB, DL, VT); + return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB); + } + } + if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); - SDValue Cmp = SetCC.getOperand(1); + SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); - SDValue CmpOp0 = Cmp.getOperand(0); - SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, - DAG.getConstant(1, DL, CmpOp0.getValueType())); + // (cmp Z, 1) sets the carry flag if Z is 0. 
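// A scalar model of the carry-flag arithmetic used here (helper names are
// invented): after "cmp Z, 1", CF is set exactly when Z == 0, so
//   sbb X, -1  ==  X - (-1) - CF  ==  X + (Z != 0)
//   adc X, 0   ==  X + 0 + CF     ==  X + (Z == 0)
static unsigned addIsNonZero(unsigned X, unsigned Z) {
  unsigned CF = Z < 1u ? 1u : 0u;    // cmp Z, 1
  return X - (unsigned)-1 - CF;      // sbb X, -1
}
static unsigned addIsZero(unsigned X, unsigned Z) {
  unsigned CF = Z < 1u ? 1u : 0u;    // cmp Z, 1
  return X + 0u + CF;                // adc X, 0
}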
+ SDValue Z = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, + DAG.getConstant(1, DL, Z.getValueType())); + + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); - SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) + // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(-1ULL, DL, OtherVal.getValueType()), - NewCmp); - return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, - DL, OtherVal.getValueType(), OtherVal, - DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, + DAG.getConstant(-1ULL, DL, VT), NewCmp); + + // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) + // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) + return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, + DAG.getConstant(0, DL, VT), NewCmp); +} + +static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue MulOp = N->getOperand(0); + SDValue Phi = N->getOperand(1); + + if (MulOp.getOpcode() != ISD::MUL) + std::swap(MulOp, Phi); + if (MulOp.getOpcode() != ISD::MUL) + return SDValue(); + + ShrinkMode Mode; + if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode)) + return SDValue(); + + EVT VT = N->getValueType(0); + + unsigned RegSize = 128; + if (Subtarget.hasBWI()) + RegSize = 512; + else if (Subtarget.hasAVX2()) + RegSize = 256; + unsigned VectorSize = VT.getVectorNumElements() * 16; + // If the vector size is less than 128, or greater than the supported RegSize, + // do not use PMADD. + if (VectorSize < 128 || VectorSize > RegSize) + return SDValue(); + + SDLoc DL(N); + EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements()); + EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + VT.getVectorNumElements() / 2); + + // Shrink the operands of mul. 
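// A rough per-lane model of what VPMADDWD computes (illustrative helper, not
// part of this lowering): each 32-bit result lane is the sum of two adjacent
// signed 16-bit products.
static int pmaddwdLane(short A0, short A1, short B0, short B1) {
  return (int)A0 * B0 + (int)A1 * B1;  // pairwise multiply, horizontal add
}
// The two TRUNCATE nodes below shrink the multiply operands to i16 so that
// the pattern applies.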
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + + // Madd vector size is half of the original vector size + SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); + // Fill the rest of the output with 0 + SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); + return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -33656,6 +34742,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (Flags->hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; + if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) + return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); @@ -33667,7 +34755,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, true)) return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, @@ -33700,36 +34788,44 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, isHorizontalBinOp(Op0, Op1, false)) return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1); - return OptimizeConditionalInDecrement(N, DAG); + return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + SDLoc DL(N); unsigned Opcode = N->getOpcode(); MVT VT = N->getSimpleValueType(0); MVT SVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + SDValue Op = N->getOperand(0); MVT OpVT = Op.getSimpleValueType(); MVT OpEltVT = OpVT.getVectorElementType(); - unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + unsigned OpEltSizeInBits = OpEltVT.getSizeInBits(); + unsigned InputBits = OpEltSizeInBits * NumElts; // Perform any constant folding. // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled. - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - unsigned NumDstElts = VT.getVectorNumElements(); - SmallBitVector Undefs(NumDstElts, false); - SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0)); - for (unsigned i = 0; i != NumDstElts; ++i) { - SDValue OpElt = Op.getOperand(i); - if (OpElt.getOpcode() == ISD::UNDEF) { - Undefs[i] = true; + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { + APInt Undefs(NumElts, 0); + SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0)); + bool IsZEXT = + (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); + for (unsigned i = 0; i != NumElts; ++i) { + if (UndefElts[i]) { + Undefs.setBit(i); continue; } - APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue(); - Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits()) - : Cst.sextOrTrunc(SVT.getSizeInBits()); + Vals[i] = IsZEXT ? 
EltBits[i].zextOrTrunc(EltSizeInBits) + : EltBits[i].sextOrTrunc(EltSizeInBits); } return getConstVector(Vals, Undefs, VT, DAG, DL); } @@ -33829,7 +34925,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) - return getOnesVector(VT, Subtarget, DAG, DL); + return getOnesVector(VT, DAG, DL); if (N->getOpcode() == X86ISD::PCMPGT) return getZeroVector(VT, Subtarget, DAG, DL); } @@ -33837,6 +34933,98 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + SDValue Vec = N->getOperand(0); + SDValue SubVec = N->getOperand(1); + SDValue Idx = N->getOperand(2); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + MVT OpVT = N->getSimpleValueType(0); + MVT SubVecVT = SubVec.getSimpleValueType(); + + // If this is an insert of an extract, combine to a shuffle. Don't do this + // if the insert or extract can be represented with a subvector operation. + if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && + SubVec.getOperand(0).getSimpleValueType() == OpVT && + (IdxVal != 0 || !Vec.isUndef())) { + int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue(); + if (ExtIdxVal != 0) { + int VecNumElts = OpVT.getVectorNumElements(); + int SubVecNumElts = SubVecVT.getVectorNumElements(); + SmallVector<int, 64> Mask(VecNumElts); + // First create an identity shuffle mask. + for (int i = 0; i != VecNumElts; ++i) + Mask[i] = i; + // Now insert the extracted portion. + for (int i = 0; i != SubVecNumElts; ++i) + Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; + + return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); + } + } + + // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte + // load: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr + 16), Elts/2) + // --> load32 addr + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr + 32), Elts/2) + // --> load64 addr + // or a 16-byte or 32-byte broadcast: + // (insert_subvector (insert_subvector undef, (load16 addr), 0), + // (load16 addr), Elts/2) + // --> X86SubVBroadcast(load16 addr) + // or: + // (insert_subvector (insert_subvector undef, (load32 addr), 0), + // (load32 addr), Elts/2) + // --> X86SubVBroadcast(load32 addr) + if ((IdxVal == OpVT.getVectorNumElements() / 2) && + Vec.getOpcode() == ISD::INSERT_SUBVECTOR && + OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through bitcasts to get to the load. + if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget.getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = {SubVec2, SubVec}; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } + } + // If lower/upper loads are the same and the only users of the load, then + // lower to a VBROADCASTF128/VBROADCASTI128/etc. 
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { + if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && + SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); + } + } + // If this is subv_broadcast insert into both halves, use a larger + // subv_broadcast. + if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, + SubVec.getOperand(0)); + } + } + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -33845,6 +35033,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI, Subtarget); + case X86ISD::PEXTRW: + case X86ISD::PEXTRB: + return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget); + case ISD::INSERT_SUBVECTOR: + return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); @@ -33870,6 +35063,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: @@ -33884,12 +35078,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); - case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget); - case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget); + case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); + case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::VSHLI: - case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget); + case X86ISD::VSRAI: + case X86ISD::VSRLI: + return combineVectorShiftImm(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VSEXT: case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); + case X86ISD::PINSRB: + case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::PALIGNR: @@ -34717,10 +35917,20 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return Res; } - // 'A' means EAX + EDX. + // 'A' means [ER]AX + [ER]DX. 
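// As a usage sketch (not code from this backend), the classic 32-bit idiom
// that depends on this constraint binds EDX:EAX for a double-word result:
static unsigned long long readTimeStampCounter() {
  unsigned long long TSC;
  __asm__ volatile("rdtsc" : "=A"(TSC)); // "=A": EDX:EAX on 32-bit x86
  return TSC;
}
// The code below now selects the matching A/D register class for 64-, 32- and
// 16-bit subtargets instead of hard-coding EAX.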
if (Constraint == "A") { - Res.first = X86::EAX; - Res.second = &X86::GR32_ADRegClass; + if (Subtarget.is64Bit()) { + Res.first = X86::RAX; + Res.second = &X86::GR64_ADRegClass; + } else if (Subtarget.is32Bit()) { + Res.first = X86::EAX; + Res.second = &X86::GR32_ADRegClass; + } else if (Subtarget.is16Bit()) { + Res.first = X86::AX; + Res.second = &X86::GR16_ADRegClass; + } else { + llvm_unreachable("Expecting 64, 32 or 16 bit subtarget"); + } return Res; } return Res; @@ -34812,7 +36022,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. @@ -34820,8 +36030,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, - Attribute::MinSize); + bool OptSize = + Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 37f9353042b18..ab4910daca02b 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -149,8 +149,7 @@ namespace llvm { WrapperRIP, /// Copies a 64-bit value from the low word of an XMM vector - /// to an MMX vector. If you think this is too close to the previous - /// mnemonic, so do I; blame Intel. + /// to an MMX vector. MOVDQ2Q, /// Copies a 32-bit value from the low word of a MMX @@ -179,7 +178,7 @@ namespace llvm { /// Insert the lower 16-bits of a 32-bit value to a vector, /// corresponds to X86::PINSRW. - PINSRW, MMX_PINSRW, + PINSRW, /// Shuffle 16 8-bit values within a vector. PSHUFB, @@ -195,21 +194,21 @@ namespace llvm { /// Blend where the selector is an immediate. BLENDI, - /// Blend where the condition has been shrunk. - /// This is used to emphasize that the condition mask is - /// no more valid for generic VSELECT optimizations. + /// Dynamic (non-constant condition) vector blend where only the sign bits + /// of the condition elements are used. This is used to enforce that the + /// condition mask is not valid for generic VSELECT optimizations. SHRUNKBLEND, /// Combined add and sub on an FP vector. ADDSUB, // FP vector ops with rounding mode. - FADD_RND, - FSUB_RND, - FMUL_RND, - FDIV_RND, - FMAX_RND, - FMIN_RND, + FADD_RND, FADDS_RND, + FSUB_RND, FSUBS_RND, + FMUL_RND, FMULS_RND, + FDIV_RND, FDIVS_RND, + FMAX_RND, FMAXS_RND, + FMIN_RND, FMINS_RND, FSQRT_RND, FSQRTS_RND, // FP vector get exponent. @@ -239,9 +238,6 @@ namespace llvm { FHADD, FHSUB, - // Integer absolute value - ABS, - // Detect Conflicts Within a Vector CONFLICT, @@ -251,6 +247,9 @@ namespace llvm { /// Commutative FMIN and FMAX. FMAXC, FMINC, + /// Scalar intrinsic floating point max and min. + FMAXS, FMINS, + /// Floating point reciprocal-sqrt and reciprocal approximation. /// Note that these typically require refinement /// in order to obtain suitable precision. @@ -320,6 +319,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Shifts of mask registers. 
+ KSHIFTL, KSHIFTR, + // Bit rotate by immediate VROTLI, VROTRI, @@ -443,8 +445,7 @@ namespace llvm { // Broadcast subvector to vector. SUBV_BROADCAST, - // Insert/Extract vector element. - VINSERT, + // Extract vector element. VEXTRACT, /// SSE4A Extraction and Insertion. @@ -686,6 +687,9 @@ namespace llvm { unsigned getJumpTableEncoding() const override; bool useSoftFloat() const override; + void markLibCallAttributes(MachineFunction *MF, unsigned CC, + ArgListTy &Args) const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i8; } @@ -806,8 +810,17 @@ namespace llvm { return false; } + bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; + bool hasAndNotCompare(SDValue Y) const override; + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { + return VT.isScalarInteger(); + } + + /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST. + MVT hasFastEqualityCompare(unsigned NumBits) const override; + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -817,11 +830,13 @@ namespace llvm { void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; /// Determine the number of bits in the operation that are sign bits. unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override; @@ -984,6 +999,10 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool convertSelectOfConstantsToMath() const override { + return true; + } + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; @@ -1035,7 +1054,7 @@ namespace llvm { /// \brief Customize the preferred legalization strategy for certain types. 
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; - bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; @@ -1076,7 +1095,8 @@ namespace llvm { CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const; + SmallVectorImpl<SDValue> &InVals, + uint32_t *RegMask) const; SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl<ISD::InputArg> &ArgInfo, const SDLoc &dl, SelectionDAG &DAG, @@ -1138,8 +1158,7 @@ namespace llvm { SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index ba1aede3c1a0c..08b501ff20bf0 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -38,7 +38,9 @@ multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; } -multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0, + string Ver = ""> { + let isCommutable = Commutable in def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, [(set VR64:$dst, (!cast<Intrinsic>( !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; @@ -63,25 +65,25 @@ multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { (bitconvert (load_mmx addr:$src))))]>; } -defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>; defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; -defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; -defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", 1>; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>; defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; -defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", 1>; defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; -defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; -defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", 1>; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>; defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; -defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", @@ -98,6 +100,6 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins 
i8mem:$addr), "prefetchw\t$addr", // "3DNowA" instructions defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; -defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; -defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">; defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 230d1700b8d29..c38c13bb97571 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -34,13 +34,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1", "v" # NumElts # "i1")); - // The GPR register class that can hold the write mask. Use GR8 for fewer - // than 8 elements. Use shift-right and equal to work around the lack of - // !lt in tablegen. - RegisterClass MRC = - !cast<RegisterClass>("GR" # - !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); - // Suffix used in the instruction mnemonic. string Suffix = suffix; @@ -69,6 +62,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, // The corresponding memory operand, e.g. i512mem for VR512. X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + // FP scalar memory operand for intrinsics - ssmem/sdmem. + Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"), + !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?)); // Load patterns // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 @@ -89,6 +85,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), + !cast<ComplexPattern>("sse_load_f32"), + !if (!eq (EltTypeName, "f64"), + !cast<ComplexPattern>("sse_load_f64"), + ?)); + // The corresponding float type, e.g. v16f32 for v16i32 // Note: For EltSize < 32, FloatVT is illegal and TableGen // fails to compile, so we choose FloatVT = VT @@ -207,7 +209,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, Pattern, itin>; // Prefer over VMOV*rrk Pat<> - let AddedComplexity = 20, isCommutable = IsKCommutable in + let isCommutable = IsKCommutable in def NAME#k: AVX512<O, F, Outs, MaskingIns, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# "$dst {${mask}}, "#IntelSrcAsm#"}", @@ -219,7 +221,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, // Zero mask does not add any restrictions to commute operands transformation. // So, it is Ok to use IsCommutable instead of IsKCommutable. - let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<> + let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# "$dst {${mask}} {z}, "#IntelSrcAsm#"}", @@ -250,6 +252,23 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, MaskingConstraint, NoItinerary, IsCommutable, IsKCommutable>; +// Similar to AVX512_maskable_common, but with scalar types. 
+multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + SDNode Select = vselect, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0, + bit IsKCommutable = 0> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [], [], [], + MaskingConstraint, NoItinerary, IsCommutable, + IsKCommutable>; + // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the // perserved vector elements come from a new dummy input operand tied to $dst. @@ -460,7 +479,7 @@ def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), } let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in { + isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in { def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", [(set VR128X:$dst, (v4i32 immAllZerosV))]>; def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", @@ -470,7 +489,7 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in { + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", @@ -484,7 +503,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To PatFrag vinsert_insert> { let ExeDomain = To.ExeDomain in { defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), - (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + (ins To.RC:$src1, From.RC:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts, "$src3, $src2, $src1", "$src1, $src2, $src3", (vinsert_insert:$src3 (To.VT To.RC:$src1), @@ -492,7 +511,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To (iPTR imm))>, AVX512AIi8Base, EVEX_4V; defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), - (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts, "$src3, $src2, $src1", "$src1, $src2, $src3", (vinsert_insert:$src3 (To.VT To.RC:$src1), @@ -625,14 +644,14 @@ multiclass vextract_for_size<int Opcode, // vextract_extract), we interesting only in patterns without mask, // intrinsics pattern match generated bellow. 
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins From.RC:$src1, i32u8imm:$idx), + (ins From.RC:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)))]>, AVX512AIi8Base, EVEX; def mr : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx), + (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x" # To.NumElts # "\t{$idx, $src1, $dst|$dst, $src1, $idx}", [(store (To.VT (vextract_extract:$idx @@ -642,7 +661,7 @@ multiclass vextract_for_size<int Opcode, let mayStore = 1, hasSideEffects = 0 in def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs), (ins To.MemOp:$dst, To.KRCWM:$mask, - From.RC:$src1, i32u8imm:$idx), + From.RC:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x" # To.NumElts # "\t{$idx, $src1, $dst {${mask}}|" "$dst {${mask}}, $src1, $idx}", @@ -846,32 +865,20 @@ def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs), // broadcast with a scalar argument. multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { - - let isCodeGenOnly = 1 in { - def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst), - (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", - [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>, - Requires<[HasAVX512]>, T8PD, EVEX; - - let Constraints = "$src0 = $dst" in - def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst), - (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src), - OpcodeStr#"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(set DestInfo.RC:$dst, - (vselect DestInfo.KRCWM:$mask, - (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), - DestInfo.RC:$src0))]>, - Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K; - - def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst), - (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src), - OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", - [(set DestInfo.RC:$dst, - (vselect DestInfo.KRCWM:$mask, - (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), - DestInfo.ImmAllZerosV))]>, - Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ; - } // let isCodeGenOnly = 1 in + def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#r) + (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; + def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#rk) + DestInfo.RC:$src0, DestInfo.KRCWM:$mask, + (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; + def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz) + DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; } multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, @@ -892,7 +899,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, (SrcInfo.VT (scalar_to_vector (SrcInfo.ScalarLdFrag addr:$src))))), (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>; - let AddedComplexity = 20 in def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, (X86VBroadcast (SrcInfo.VT (scalar_to_vector @@ -900,7 +906,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, DestInfo.RC:$src0)), (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk) DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>; - let 
AddedComplexity = 30 in def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, (X86VBroadcast (SrcInfo.VT (scalar_to_vector @@ -951,39 +956,42 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), (VBROADCASTSDZm addr:$src)>; multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, + SDPatternOperator OpNode, RegisterClass SrcRC> { + let ExeDomain = _.ExeDomain in defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins SrcRC:$src), "vpbroadcast"##_.Suffix, "$src", "$src", - (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX; + (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX; } multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + SDPatternOperator OpNode, RegisterClass SrcRC, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512; + defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256; - defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128; + defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128; } } let isCodeGenOnly = 1 in { -defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8, - HasBWI>; -defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16, - HasBWI>; +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, + X86VBroadcast, GR8, HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, + X86VBroadcast, GR16, HasBWI>; } let isAsmParserOnly = 1 in { defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, - GR32, HasBWI>; + null_frag, GR32, HasBWI>; defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, - GR32, HasBWI>; + null_frag, GR32, HasBWI>; } -defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, - HasAVX512>; -defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, - HasAVX512>, VEX_W; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, + X86VBroadcast, GR32, HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, + X86VBroadcast, GR64, HasAVX512>, VEX_W; def : Pat <(v16i32 (X86vzext VK16WM:$mask)), (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; @@ -1035,7 +1043,18 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, AVX5128IBase, EVEX; } +let Predicates = [HasAVX512] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))), + (VPBROADCASTQZm addr:$src)>; +} + let Predicates = [HasVLX, HasBWI] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + (VPBROADCASTQZ128m addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + (VPBROADCASTQZ256m addr:$src)>; // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. // This means we'll encounter truncated i32 loads; match that here. 
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), @@ -1075,18 +1094,12 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. -def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), - (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8f32 VR256X:$src), 1)>; def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v4f64 VR256X:$src), 1)>; def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v4i64 VR256X:$src), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8i32 VR256X:$src), 1)>; def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v16i16 VR256X:$src), 1)>; @@ -1098,46 +1111,6 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), (VBROADCASTI32X4rm addr:$src)>; def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), (VBROADCASTI32X4rm addr:$src)>; - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. -def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), - (VINSERTF64x4Zrr - (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; -def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), - (VINSERTI64x4Zrr - (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; - -def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), - (VINSERTI64x4Zrr - (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; -def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), - (VINSERTI64x4Zrr - (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; } let Predicates = [HasVLX] in { @@ -1209,25 +1182,6 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTI32X4rm addr:$src)>; -def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), - (VINSERTF64x4Zrr - (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), - (VINSERTI64x4Zrr - (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - 
(EXTRACT_SUBREG - (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; - def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), @@ -1265,25 +1219,6 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v8i32 VR256X:$src), 1)>; - -def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), - (VINSERTF32x8Zrr - (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), - (VINSERTI32x8Zrr - (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1), - (EXTRACT_SUBREG - (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), - VR128X:$src, sub_xmm), - VR128X:$src, 1)), sub_ymm), 1)>; } multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, @@ -1310,6 +1245,13 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", avx512vl_f32_info, avx512vl_f64_info>; +let Predicates = [HasVLX] in { +def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>; +def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; +} + def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), @@ -1604,13 +1546,13 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc)>, EVEX_4V; + let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -1629,6 +1571,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + let mayLoad = 1 in defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -1667,8 +1610,10 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> } let Predicates = [HasAVX512] in { + let ExeDomain = SSEPackedSingle in defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, AVX512XSIi8Base; + let ExeDomain = SSEPackedDouble in defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, AVX512XDIi8Base, VEX_W; } @@ -2087,22 +2032,20 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT 
_.RC:$src1), (i32 imm:$src2))))], NoItinerary>, EVEX_K; - let AddedComplexity = 20 in { - def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), - (ins _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set _.KRC:$dst, - (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2)))], NoItinerary>; - def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), - (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", - [(set _.KRC:$dst,(or _.KRCWM:$mask, + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2))))], NoItinerary>, EVEX_K; - } + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; } } @@ -2242,28 +2185,26 @@ let Predicates = [HasBWI] in { // GR from/to mask register def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), - (COPY_TO_REGCLASS GR16:$src, VK16)>; + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>; def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), - (COPY_TO_REGCLASS VK16:$src, GR16)>; + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>; def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), - (COPY_TO_REGCLASS GR8:$src, VK8)>; + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>; def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), - (COPY_TO_REGCLASS VK8:$src, GR8)>; + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>; def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), (KMOVWrk VK16:$src)>; def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), - (i32 (INSERT_SUBREG (IMPLICIT_DEF), - (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>; + (COPY_TO_REGCLASS VK16:$src, GR32)>; def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), - (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>; + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>; def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), - (i32 (INSERT_SUBREG (IMPLICIT_DEF), - (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>; + (COPY_TO_REGCLASS VK8:$src, GR32)>; def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (COPY_TO_REGCLASS GR32:$src, VK32)>; @@ -2296,20 +2237,20 @@ let Predicates = [HasDQI] in { let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store VK1:$src, addr:$dst), (MOV8mr addr:$dst, - (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), - sub_8bit))>; + (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), + sub_8bit)))>; def : Pat<(store VK2:$src, addr:$dst), (MOV8mr addr:$dst, - (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)), - sub_8bit))>; + (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)), + sub_8bit)))>; def : Pat<(store VK4:$src, addr:$dst), (MOV8mr addr:$dst, - (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)), - sub_8bit))>; + (i8 
(EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)), + sub_8bit)))>; def : Pat<(store VK8:$src, addr:$dst), (MOV8mr addr:$dst, - (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)), - sub_8bit))>; + (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), + sub_8bit)))>; def : Pat<(v8i1 (load addr:$src)), (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>; @@ -2340,44 +2281,41 @@ let Predicates = [HasBWI] in { let Predicates = [HasAVX512] in { def : Pat<(i1 (trunc (i64 GR64:$src))), - (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), - (i32 1))), VK1)>; + (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit), + (i32 1)), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), - (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>; + (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>; def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))), (COPY_TO_REGCLASS GR32:$src, VK1)>; def : Pat<(i1 (trunc (i8 GR8:$src))), (COPY_TO_REGCLASS - (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit), (i32 1))), - VK1)>; + (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit), (i32 1)), VK1)>; def : Pat<(i1 (trunc (i16 GR16:$src))), (COPY_TO_REGCLASS - (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit), (i32 1))), - VK1)>; + (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit), (i32 1)), VK1)>; def : Pat<(i32 (zext VK1:$src)), - (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; + (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), (COPY_TO_REGCLASS VK1:$src, GR32)>; def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG - (AND32ri8 (KMOVWrk - (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>; def : Pat<(i8 (anyext VK1:$src)), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>; def : Pat<(i64 (zext VK1:$src)), - (AND64ri8 (SUBREG_TO_REG (i64 0), - (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; + (SUBREG_TO_REG (i64 0), + (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>; def : Pat<(i64 (anyext VK1:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), @@ -2385,8 +2323,7 @@ let Predicates = [HasAVX512] in { def : Pat<(i16 (zext VK1:$src)), (EXTRACT_SUBREG - (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), - sub_16bit)>; + (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>; def : Pat<(i16 (anyext VK1:$src)), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>; @@ -2440,15 +2377,6 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>; -multiclass avx512_mask_unop_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") - (i16 GR16:$src)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>; -} -defm : avx512_mask_unop_int<"knot", "KNOT">; - // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit let Predicates = [HasAVX512, NoDQI] in def : Pat<(vnot VK8:$src), @@ -2497,21 +2425,6 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>; defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; -multiclass avx512_mask_binop_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - 
def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} - -defm : avx512_mask_binop_int<"kand", "KAND">; -defm : avx512_mask_binop_int<"kandn", "KANDN">; -defm : avx512_mask_binop_int<"kor", "KOR">; -defm : avx512_mask_binop_int<"kxnor", "KXNOR">; -defm : avx512_mask_binop_int<"kxor", "KXOR">; - multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, Instruction Inst> { // With AVX512F, 8-bit mask is promoted to 16-bit mask, @@ -2613,8 +2526,8 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, } } -defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; -defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>; +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; // Mask setting all 0s or 1s multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { @@ -2625,7 +2538,6 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { } multiclass avx512_mask_setop_w<PatFrag Val> { - defm B : avx512_mask_setop<VK8, v8i1, Val>; defm W : avx512_mask_setop<VK16, v16i1, Val>; defm D : avx512_mask_setop<VK32, v32i1, Val>; defm Q : avx512_mask_setop<VK64, v64i1, Val>; @@ -2642,9 +2554,11 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; - def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; - def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; - def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + let AddedComplexity = 10 in { // To optimize isel table. 
+ def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>; + def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>; + } } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 @@ -2695,12 +2609,12 @@ def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), // Patterns for kmask shift multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> { - def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))), + def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))), (VT (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16), (I8Imm $imm)), RC))>; - def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))), + def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))), (VT (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16), (I8Imm $imm)), @@ -2738,7 +2652,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], _.ExeDomain>, EVEX; - let Constraints = "$src0 = $dst" in { + let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", @@ -3160,6 +3074,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src) "vmovq\t{$src, $dst|$dst, $src}", [(set FR64X:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; +def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64X:$src))], @@ -3272,20 +3190,22 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, (scalar_to_vector _.FRC:$src2))))], _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + (ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", "$dst {${mask}} {z}, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, - (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))), _.ImmAllZerosV)))], _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ; let Constraints = "$src0 = $dst" in def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, - (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))), (_.VT _.RC:$src0))))], _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K; let canFoldAsLoad = 1, isReMaterializable = 1 in @@ -3335,8 +3255,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk) (COPY_TO_REGCLASS _.FRC:$src2, _.RC), (COPY_TO_REGCLASS GR32:$mask, VK1WM), - (_.VT _.RC:$src0), - (COPY_TO_REGCLASS _.FRC:$src1, _.RC)), + (_.VT _.RC:$src0), _.FRC:$src1), _.RC)>; def : Pat<(_.VT (OpNode _.RC:$src0, @@ -3346,10 +3265,8 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.EltVT ZeroFP))))))), (COPY_TO_REGCLASS 
(!cast<Instruction>(InstrStr#rrkz) (COPY_TO_REGCLASS GR32:$mask, VK1WM), - (_.VT _.RC:$src0), - (COPY_TO_REGCLASS _.FRC:$src1, _.RC)), + (_.VT _.RC:$src0), _.FRC:$src1), _.RC)>; - } multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, @@ -3359,14 +3276,31 @@ def : Pat<(masked_store addr:$dst, Mask, (_.info512.VT (insert_subvector undef, (_.info256.VT (insert_subvector undef, (_.info128.VT _.info128.RC:$src), - (i64 0))), - (i64 0)))), + (iPTR 0))), + (iPTR 0)))), (!cast<Instruction>(InstrStr#mrk) addr:$dst, (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; } +multiclass avx512_store_scalar_lowering_subreg<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC, + SubRegIndex subreg> { + +def : Pat<(masked_store addr:$dst, Mask, + (_.info512.VT (insert_subvector undef, + (_.info256.VT (insert_subvector undef, + (_.info128.VT _.info128.RC:$src), + (iPTR 0))), + (iPTR 0)))), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + +} + multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, dag Mask, RegisterClass MaskRC> { @@ -3374,7 +3308,7 @@ def : Pat<(_.info128.VT (extract_subvector (_.info512.VT (masked_load addr:$srcAddr, Mask, (_.info512.VT (bitconvert (v16i32 immAllZerosV))))), - (i64 0))), + (iPTR 0))), (!cast<Instruction>(InstrStr#rmkz) (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), addr:$srcAddr)>; @@ -3384,53 +3318,81 @@ def : Pat<(_.info128.VT (extract_subvector (_.info512.VT (insert_subvector undef, (_.info256.VT (insert_subvector undef, (_.info128.VT (X86vzmovl _.info128.RC:$src)), - (i64 0))), - (i64 0))))), - (i64 0))), + (iPTR 0))), + (iPTR 0))))), + (iPTR 0))), (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)), addr:$srcAddr)>; } +multiclass avx512_load_scalar_lowering_subreg<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC, + SubRegIndex subreg> { + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (bitconvert + (v16i32 immAllZerosV))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmkz) + (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (insert_subvector undef, + (_.info256.VT (insert_subvector undef, + (_.info128.VT (X86vzmovl _.info128.RC:$src)), + (iPTR 0))), + (iPTR 0))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)), + addr:$srcAddr)>; + +} + defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; -defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, - (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>; -defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info, - (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and 
GR16:$mask, (i16 1))))), GR16, sub_16bit>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; -defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, - (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>; -defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info, - (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), - (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s", "$src2, $src1", "$src1, $src2", []>, XS, EVEX_4V, VEX_LIG; let hasSideEffects = 0 in -defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, - (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), +defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, XD, EVEX_4V, VEX_LIG, VEX_W; @@ -3439,31 +3401,31 @@ let Predicates = [HasAVX512] in { // Move scalar to XMM zero-extended, zeroing a VR128X then do a // MOVS{S,D} to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))), - (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>; + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), - (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), - (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), - (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>; + (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>; } // Move low f32 and clear high bits. 
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4i32 (V_SET0)), + (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>; def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4i32 (V_SET0)), + (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>; let AddedComplexity = 20 in { @@ -3525,11 +3487,11 @@ let Predicates = [HasAVX512] in { } def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)), + (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)), sub_xmm)>; def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)), + (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), @@ -3538,18 +3500,18 @@ let Predicates = [HasAVX512] in { // Move low f64 and clear high bits. def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (v2f64 (V_SET0)), + (VMOVSDZrr (v2f64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (v2f64 (V_SET0)), + (VMOVSDZrr (v2f64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>; // Extract and store. @@ -3582,10 +3544,6 @@ let Predicates = [HasAVX512] in { (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; - def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; - def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; // 256-bit variants def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), @@ -3635,6 +3593,8 @@ let Predicates = [HasAVX512] in { } // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
let AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), @@ -3669,42 +3629,26 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i64 (X86vzload addr:$src)), (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; } - -def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - -def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>; - -def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>; - //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// let SchedRW = [WriteLoad] in { def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], - SSEPackedInt>, EVEX, T8PD, EVEX_V512, + [], SSEPackedInt>, EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>; let Predicates = [HasVLX] in { def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), (ins i256mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))], - SSEPackedInt>, EVEX, T8PD, EVEX_V256, + [], SSEPackedInt>, EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>; def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))], - SSEPackedInt>, EVEX, T8PD, EVEX_V128, + [], SSEPackedInt>, EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>; } } @@ -4150,8 +4094,7 @@ let Predicates = [HasDQI, NoVLX] in { //===----------------------------------------------------------------------===// multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, OpndItins itins, - bit IsCommutable = 0> { + X86VectorVTInfo _, bit IsCommutable = 0> { defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -4159,7 +4102,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (bitconvert (_.VT _.RC:$src2)))), (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2)))), - itins.rr, IsCommutable>, + IIC_SSE_BIT_P_RR, IsCommutable>, AVX512BIBase, EVEX_4V; defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -4169,14 +4112,13 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (bitconvert (_.LdFrag addr:$src2)))), (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))))), - itins.rm>, + IIC_SSE_BIT_P_RM>, AVX512BIBase, EVEX_4V; } multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, OpndItins itins, - bit IsCommutable = 0> : - avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + X86VectorVTInfo _, bit IsCommutable = 0> : + avx512_logic_rm<opc, OpcodeStr, OpNode, _, 
IsCommutable> { defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", @@ -4189,58 +4131,48 @@ multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (bitconvert (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))))), - itins.rm>, + IIC_SSE_BIT_P_RM>, AVX512BIBase, EVEX_4V, EVEX_B; } multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, OpndItins itins, - Predicate prd, bit IsCommutable = 0> { - let Predicates = [prd] in - defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + AVX512VLVectorVTInfo VTInfo, + bit IsCommutable = 0> { + let Predicates = [HasAVX512] in + defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, IsCommutable>, EVEX_V512; - let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, IsCommutable>, EVEX_V128; } } multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins, Predicate prd, bit IsCommutable = 0> { defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, - itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; + IsCommutable>, EVEX_CD8<32, CD8VF>; } multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins, Predicate prd, bit IsCommutable = 0> { defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, - itins, prd, IsCommutable>, - VEX_W, EVEX_CD8<64, CD8VF>; + IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; } multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, - SDNode OpNode, OpndItins itins, Predicate prd, - bit IsCommutable = 0> { - defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, - IsCommutable>; - - defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, - IsCommutable>; + SDNode OpNode, bit IsCommutable = 0> { + defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>; + defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>; } -defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, - SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, - SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, - SSE_INTALU_ITINS_P, HasAVX512, 1>; -defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, - SSE_INTALU_ITINS_P, HasAVX512, 0>; +defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>; +defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>; +defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>; +defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -4252,16 +4184,16 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_CURRENT)), + 
(_.VT (VecNode _.RC:$src1, _.RC:$src2, + (i32 FROUND_CURRENT))), itins.rr>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (VecNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i32 FROUND_CURRENT)), + (_.VT (VecNode _.RC:$src1, + _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT))), itins.rm>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), @@ -4291,13 +4223,43 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo EVEX_B, EVEX_RC; } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode VecNode, OpndItins itins, bit IsCommutable> { - let ExeDomain = _.ExeDomain in + SDNode OpNode, SDNode VecNode, SDNode SaeNode, + OpndItins itins, bit IsCommutable> { + let ExeDomain = _.ExeDomain in { + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, _.RC:$src2)), + itins.rr>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, + _.ScalarIntMemCPat:$src2)), + itins.rm>; + + let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], + itins.rr> { + let isCommutable = IsCommutable; + } + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))], itins.rm>; + } + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 FROUND_NO_EXC))>, EVEX_B; + } } multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -4316,31 +4278,29 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode VecNode, + SDNode VecNode, SDNode SaeNode, SizeItins itins, bit IsCommutable> { - defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, - itins.s, IsCommutable>, - avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode, - itins.s, IsCommutable>, + defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, + VecNode, SaeNode, itins.s, IsCommutable>, XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, - itins.d, IsCommutable>, - avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode, - itins.d, IsCommutable>, + defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, + VecNode, SaeNode, itins.d, IsCommutable>, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } -defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>; -defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>; -defm VSUB : 
avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>; -defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>; -defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>; -defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>; +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>; +defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds, + SSE_ALU_ITINS_S, 0>; +defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds, + SSE_ALU_ITINS_S, 0>; // MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, OpndItins itins> { - let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4598,6 +4558,7 @@ let Predicates = [HasVLX,HasDQI] in { multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -4613,10 +4574,12 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>, EVEX_4V, EVEX_B; + } } multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", @@ -4627,6 +4590,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src1, (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>; + } } multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> { @@ -4899,6 +4863,33 @@ defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; +// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. 
+let Predicates = [HasAVX512, NoVLX] in {
+  def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                VR128X:$src2)), sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                VR128X:$src2)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                imm:$src2)), sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                imm:$src2)), sub_xmm)>;
+}
+
 //===-------------------------------------------------------------------===//
 // Variable Bit Shifts
 //===-------------------------------------------------------------------===//
@@ -4932,6 +4923,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                     SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
                                     EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
 }
+
 multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   AVX512VLVectorVTInfo _> {
   let Predicates = [HasAVX512] in
@@ -4955,12 +4947,13 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
 }
 // Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
-  let Predicates = [HasBWI, NoVLX] in {
+multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
+                                     SDNode OpNode, list<Predicate> p> {
+  let Predicates = p in {
   def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
                                   (_.info256.VT _.info256.RC:$src2))),
             (EXTRACT_SUBREG
-              (!cast<Instruction>(NAME#"WZrr")
+              (!cast<Instruction>(OpcodeStr#"Zrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
               sub_ymm)>;
@@ -4968,13 +4961,12 @@ multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
   def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
                                   (_.info128.VT _.info128.RC:$src2))),
             (EXTRACT_SUBREG
-              (!cast<Instruction>(NAME#"WZrr")
+              (!cast<Instruction>(OpcodeStr#"Zrr")
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
                 (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
              sub_xmm)>;
   }
 }
-
 multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
   let Predicates = [HasBWI] in
@@ -4990,19 +4982,22 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
 }
 defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
-              avx512_var_shift_w<0x12, "vpsllvw", shl>,
-              avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+              avx512_var_shift_w<0x12, "vpsllvw", shl>;
 defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
-              avx512_var_shift_w<0x11, "vpsravw", sra>,
-              avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+              avx512_var_shift_w<0x11, "vpsravw", sra>;
 defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
-              avx512_var_shift_w<0x10, "vpsrlvw", srl>,
-              avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+              avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+
 defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
 defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
+
 // Special handing for handling VPSRAV intrinsics.
 multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
                                          list<Predicate> p> {
@@ -5013,7 +5008,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 20 in {
     def : Pat<(_.VT (vselect _.KRCWM:$mask, (X86vsrav _.RC:$src1, _.RC:$src2),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
@@ -5023,8 +5017,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    }
-    let AddedComplexity = 30 in {
     def : Pat<(_.VT (vselect _.KRCWM:$mask, (X86vsrav _.RC:$src1, _.RC:$src2),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
@@ -5034,7 +5026,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
-    }
   }
 }
@@ -5046,14 +5037,12 @@ multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
                      (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
                _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 20 in
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 30 in
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
@@ -5251,6 +5240,7 @@ let Predicates = [HasAVX512] in {
 //===----------------------------------------------------------------------===//
 multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
   def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
                   (ins _.RC:$src1, f64mem:$src2),
                   !strconcat(OpcodeStr,
@@ -5599,7 +5589,7 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                           "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
   defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-          (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+          (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
           "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
   defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5625,13 +5615,13 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                             string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
                             SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
-
+  let ExeDomain = _.ExeDomain in {
   defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
                 // Operands for intrinsic are in 123 order to preserve passthu
                 // semantics.
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))), (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, - (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))), + _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))), (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, @@ -5641,8 +5631,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds3 _.RC:$src2, - (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3, _.RC:$src1, (i32 FROUND_CURRENT))), (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), @@ -5653,8 +5642,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds1 _.RC:$src1, - (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3, _.RC:$src2, (i32 FROUND_CURRENT))), (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), @@ -5662,6 +5650,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, _.FRC:$src2))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>; + } } multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, @@ -5692,6 +5681,7 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, let Constraints = "$src1 = $dst" in { multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", @@ -5711,6 +5701,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, AVX512FMA3Base, EVEX_B; + } } } // Constraints = "$src1 = $dst" @@ -5878,10 +5869,10 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT , !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>, EVEX, VEX_LIG, EVEX_B, EVEX_RC; - def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src), + def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode - (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))), + (SrcVT.VT SrcVT.ScalarIntMemCPat:$src), (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; } // Predicates = [HasAVX512] @@ -5918,20 +5909,20 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))), (VCVTSS2SIZrr VR128X:$src)>; - def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))), - (VCVTSS2SIZrm addr:$src)>; + def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)), + (VCVTSS2SIZrm sse_load_f32:$src)>; def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))), (VCVTSS2SI64Zrr VR128X:$src)>; - def : Pat<(i64 
(int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))), - (VCVTSS2SI64Zrm addr:$src)>; + def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)), + (VCVTSS2SI64Zrm sse_load_f32:$src)>; def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))), (VCVTSD2SIZrr VR128X:$src)>; - def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))), - (VCVTSD2SIZrm addr:$src)>; + def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)), + (VCVTSD2SIZrm sse_load_f64:$src)>; def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))), (VCVTSD2SI64Zrr VR128X:$src)>; - def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))), - (VCVTSD2SI64Zrm addr:$src)>; + def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)), + (VCVTSD2SI64Zrm sse_load_f64:$src)>; } // HasAVX512 let Predicates = [HasAVX512] in { @@ -6018,7 +6009,7 @@ let Predicates = [HasAVX512] in { EVEX,VEX_LIG , EVEX_B; let mayLoad = 1, hasSideEffects = 0 in def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), - (ins _SrcRC.MemOp:$src), + (ins _SrcRC.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; @@ -6055,47 +6046,58 @@ defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), (VCVTTSS2SIZrr_Int VR128X:$src)>; - def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))), - (VCVTTSS2SIZrm_Int addr:$src)>; + def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)), + (VCVTTSS2SIZrm_Int ssmem:$src)>; def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), (VCVTTSS2SI64Zrr_Int VR128X:$src)>; - def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))), - (VCVTTSS2SI64Zrm_Int addr:$src)>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)), + (VCVTTSS2SI64Zrm_Int ssmem:$src)>; def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), (VCVTTSD2SIZrr_Int VR128X:$src)>; - def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))), - (VCVTTSD2SIZrm_Int addr:$src)>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)), + (VCVTTSD2SIZrm_Int sdmem:$src)>; def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), (VCVTTSD2SI64Zrr_Int VR128X:$src)>; - def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))), - (VCVTTSD2SI64Zrm_Int addr:$src)>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)), + (VCVTTSD2SI64Zrm_Int sdmem:$src)>; } // HasAVX512 //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode> { - defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2), (i32 FROUND_CURRENT)))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; - defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT (scalar_to_vector - (_Src.ScalarLdFrag addr:$src2))), + 
(_Src.VT _Src.ScalarIntMemCPat:$src2), (i32 FROUND_CURRENT)))>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; + + let isCodeGenOnly = 1, hasSideEffects = 0 in { + def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _Src.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + let mayLoad = 1 in + def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; + } } // Scalar Coversion with SAE - suppress all exceptions multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeRnd> { - defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", (_.VT (OpNodeRnd (_.VT _.RC:$src1), @@ -6107,7 +6109,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn // Scalar Conversion with rounding control (RC) multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeRnd> { - defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd (_.VT _.RC:$src1), @@ -6140,39 +6142,36 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpextRnd,f32x_info, f64x_info >; def : Pat<(f64 (fpextend FR32X:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), - (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>, Requires<[HasAVX512]>; def : Pat<(f64 (fpextend (loadf32 addr:$src))), - (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; def : Pat<(f64 (extloadf32 addr:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; def : Pat<(f64 (extloadf32 addr:$src)), - (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, Requires<[HasAVX512, OptForSpeed]>; def : Pat<(f32 (fpround FR64X:$src)), - (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), - (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, + (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>, Requires<[HasAVX512]>; def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), - (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>, + (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>, Requires<[HasAVX512]>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), - (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>, + (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>, Requires<[HasAVX512]>; //===----------------------------------------------------------------------===// @@ -6808,7 +6807,7 @@ let Predicates = [HasAVX512] 
in { let Predicates = [HasVLX] in { defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; - defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; } } @@ -6917,7 +6916,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { - let AddedComplexity = 20 , Predicates = [HasAVX512] in { + let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -6942,6 +6941,7 @@ defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; @@ -6955,6 +6955,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.FloatVT (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, EVEX, T8PD, EVEX_B; + } } multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { @@ -6986,7 +6987,7 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { - + let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -7005,6 +7006,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>; + } } multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { @@ -7024,7 +7026,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode> { - + let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; @@ -7041,9 +7043,11 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (OpNode (_.FloatVT (X86VBroadcast (_.ScalarLdFrag addr:$src))), (i32 FROUND_CURRENT))>, EVEX_B; + } } multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode> { + let ExeDomain = _.ExeDomain in defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", @@ -7084,6 +7088,7 @@ defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>, multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>, @@ -7092,6 +7097,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, multiclass 
avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", (_.FloatVT (OpNode _.RC:$src))>, EVEX; @@ -7106,6 +7112,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, (OpNode (_.FloatVT (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, EVEX, EVEX_B; + } } multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, @@ -7143,7 +7150,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, string SUFF, SDNode OpNode, SDNode OpNodeRnd> { - + let ExeDomain = _.ExeDomain in { defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", @@ -7176,6 +7183,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; } + } def : Pat<(_.EltVT (OpNode _.FRC:$src)), (!cast<Instruction>(NAME#SUFF#Zr) @@ -7480,11 +7488,11 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info, - v16i8x_info, i64mem, LdFrag, OpNode>, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info, @@ -7499,11 +7507,11 @@ multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, - v16i8x_info, i32mem, LdFrag, OpNode>, + v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, @@ -7518,11 +7526,11 @@ multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, - v16i8x_info, i16mem, LdFrag, OpNode>, + v16i8x_info, i16mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, @@ -7537,11 +7545,11 @@ multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, - v8i16x_info, i64mem, LdFrag, OpNode>, + v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, @@ -7556,11 +7564,11 @@ multiclass 
avx512_extend_WD<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, - v8i16x_info, i32mem, LdFrag, OpNode>, + v8i16x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, @@ -7575,12 +7583,12 @@ multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, } multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, - v4i32x_info, i64mem, LdFrag, OpNode>, + v4i32x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, @@ -7594,19 +7602,19 @@ multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, } } -defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">; -defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">; -defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">; -defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">; -defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">; -defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">; +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">; -defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; -defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">; -defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">; -defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">; -defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">; -defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">; // EXTLOAD patterns, implemented using vpmovz multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To, @@ -7649,69 +7657,69 @@ let Predicates = [HasAVX512] in { defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>; } -multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy, - SDNode ExtOp, PatFrag ExtLoad16> { +multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, + SDNode InVecOp, PatFrag ExtLoad16> { // 128-bit patterns let Predicates = [HasVLX, HasBWI] in { - def : 
Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { - def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))), (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))), (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; } // 256-bit patterns @@ -7790,8 +7798,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy, } } -defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; -defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; +defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>; +defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations @@ -7832,7 +7840,7 @@ multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem, mgatherv16i32>, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem, + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem, mgatherv8i64>, EVEX_V512; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, @@ -7889,7 +7897,7 @@ multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem, mscatterv16i32>, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem, + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem, mscatterv8i64>, EVEX_V512; let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, @@ -7922,7 +7930,7 @@ defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", - VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", VK8WM, vy512mem>, 
EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; @@ -7934,7 +7942,7 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", - VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; @@ -7946,7 +7954,7 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", - VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; @@ -7958,7 +7966,7 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", - VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; @@ -7982,6 +7990,17 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; } +// Use 512bit version to implement 128/256 bit in case NoVLX. +multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info, + X86VectorVTInfo _> { + + def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))), + (X86Info.VT (EXTRACT_SUBREG + (_.VT (!cast<Instruction>(NAME#"Zrr") + (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))), + X86Info.SubRegIdx))>; +} + multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { let Predicates = [prd] in @@ -7991,20 +8010,17 @@ let Predicates = [prd] in defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; } -} +let Predicates = [prd, NoVLX] in { + defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>; + defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>; + } -multiclass avx512_convert_mask_to_vector<string OpcodeStr> { - defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, - HasBWI>; - defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, - HasBWI>, VEX_W; - defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, - HasDQI>; - defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, - HasDQI>, VEX_W; } -defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; +defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>; +defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W; +defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>; +defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, 
MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), @@ -8319,6 +8335,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", @@ -8466,6 +8483,39 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +let Predicates = [HasAVX512] in { +// Provide fallback in case the load node that is used in the broadcast +// patterns above is used by additional users, which prevents the pattern +// selection. +def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +} + multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, AVX512AIi8Base, EVEX_4V; @@ -8503,6 +8553,7 @@ defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", @@ -8513,6 +8564,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1", "$src1", (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; + } } multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -8577,66 +8629,7 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, HasBWI>; } -defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>; - -def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), - VR128X:$src))>; -def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>; -def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>; -def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 
immAllZerosV)), - VR256X:$src))>; -def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>; -def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>; - -let Predicates = [HasBWI, HasVLX] in { - def : Pat<(xor - (bc_v2i64 (avx512_v16i1sextv16i8)), - (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))), - (VPABSBZ128rr VR128X:$src)>; - def : Pat<(xor - (bc_v2i64 (avx512_v8i1sextv8i16)), - (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))), - (VPABSWZ128rr VR128X:$src)>; - def : Pat<(xor - (bc_v4i64 (avx512_v32i1sextv32i8)), - (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))), - (VPABSBZ256rr VR256X:$src)>; - def : Pat<(xor - (bc_v4i64 (avx512_v16i1sextv16i16)), - (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))), - (VPABSWZ256rr VR256X:$src)>; -} -let Predicates = [HasAVX512, HasVLX] in { - def : Pat<(xor - (bc_v2i64 (avx512_v4i1sextv4i32)), - (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))), - (VPABSDZ128rr VR128X:$src)>; - def : Pat<(xor - (bc_v4i64 (avx512_v8i1sextv8i32)), - (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))), - (VPABSDZ256rr VR256X:$src)>; -} - -let Predicates = [HasAVX512] in { -def : Pat<(xor - (bc_v8i64 (v16i1sextv16i32)), - (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), - (VPABSDZrr VR512:$src)>; -def : Pat<(xor - (bc_v8i64 (v8i1sextv8i64)), - (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), - (VPABSQZrr VR512:$src)>; -} -let Predicates = [HasBWI] in { -def : Pat<(xor - (bc_v8i64 (v64i1sextv64i8)), - (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))), - (VPABSBZrr VR512:$src)>; -def : Pat<(xor - (bc_v8i64 (v32i1sextv32i16)), - (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))), - (VPABSWZrr VR512:$src)>; -} +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>; multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ @@ -8663,6 +8656,7 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; @@ -8671,6 +8665,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src)))))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>; + } } multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -8947,6 +8942,68 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", HasBWI>, EVEX_4V; +// Transforms to swizzle an immediate to enable better matching when +// memory operand isn't in the right place. +def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 1/4 and 3/6. + uint8_t NewImm = Imm & 0xa5; + if (Imm & 0x02) NewImm |= 0x10; + if (Imm & 0x10) NewImm |= 0x02; + if (Imm & 0x08) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 2/4 and 3/5. 
+ uint8_t NewImm = Imm & 0xc3; + if (Imm & 0x04) NewImm |= 0x10; + if (Imm & 0x10) NewImm |= 0x04; + if (Imm & 0x08) NewImm |= 0x20; + if (Imm & 0x20) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 1/2 and 5/6. + uint8_t NewImm = Imm & 0x99; + if (Imm & 0x02) NewImm |= 0x04; + if (Imm & 0x04) NewImm |= 0x02; + if (Imm & 0x20) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x20; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by moving operand 1 to the end. + uint8_t Imm = N->getZExtValue(); + // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5 + uint8_t NewImm = Imm & 0x81; + if (Imm & 0x02) NewImm |= 0x04; + if (Imm & 0x04) NewImm |= 0x10; + if (Imm & 0x08) NewImm |= 0x40; + if (Imm & 0x10) NewImm |= 0x02; + if (Imm & 0x20) NewImm |= 0x08; + if (Imm & 0x40) NewImm |= 0x20; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by moving operand 2 to the beginning. + uint8_t Imm = N->getZExtValue(); + // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3 + uint8_t NewImm = Imm & 0x81; + if (Imm & 0x02) NewImm |= 0x10; + if (Imm & 0x04) NewImm |= 0x02; + if (Imm & 0x08) NewImm |= 0x20; + if (Imm & 0x10) NewImm |= 0x04; + if (Imm & 0x20) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; + multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { @@ -8975,6 +9032,141 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (i8 imm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; }// Constraints = "$src1 = $dst" + + // Additional patterns for matching passthru operand in other positions. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + + // Additional patterns for matching loads in other positions. + def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching zero masking with loads in other + // positions. 
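The VPTERNLOG*_imm8 transforms above rework the 8-bit truth-table immediate when the three operands are reordered, so that a load or broadcast can still be folded into the $src3 slot. A minimal stand-alone C++ sketch of the idea follows, assuming the documented indexing imm8[(src1 bit << 2) | (src2 bit << 1) | (src3 bit)]; it recomputes the table for an arbitrary operand permutation and checks the result against the hard-coded bit swap of VPTERNLOG321_imm8. The helper names (swizzleTernlogImm, swizzle321) are invented for illustration and are not part of the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Perm[K] = which original operand (0 = $src1, 1 = $src2, 2 = $src3) feeds
// the new operand position K after commuting.
static uint8_t swizzleTernlogImm(uint8_t Imm, const int Perm[3]) {
  uint8_t NewImm = 0;
  for (int NewIdx = 0; NewIdx < 8; ++NewIdx) {
    int NewBits[3] = {(NewIdx >> 2) & 1, (NewIdx >> 1) & 1, NewIdx & 1};
    int OldBits[3];
    for (int K = 0; K < 3; ++K)
      OldBits[Perm[K]] = NewBits[K]; // new operand K carries old operand Perm[K]
    int OldIdx = (OldBits[0] << 2) | (OldBits[1] << 1) | OldBits[2];
    if ((Imm >> OldIdx) & 1)
      NewImm |= 1 << NewIdx;
  }
  return NewImm;
}

// The bit swap hard-coded by VPTERNLOG321_imm8 above (operands 0 and 2
// swapped): keep bits 0/2/5/7, swap bits 1/4 and 3/6.
static uint8_t swizzle321(uint8_t Imm) {
  uint8_t NewImm = Imm & 0xa5;
  if (Imm & 0x02) NewImm |= 0x10;
  if (Imm & 0x10) NewImm |= 0x02;
  if (Imm & 0x08) NewImm |= 0x40;
  if (Imm & 0x40) NewImm |= 0x08;
  return NewImm;
}

int main() {
  const int Swap02[3] = {2, 1, 0}; // "321" operand order: $src3, $src2, $src1
  for (int Imm = 0; Imm < 256; ++Imm)
    assert(swizzleTernlogImm((uint8_t)Imm, Swap02) == swizzle321((uint8_t)Imm));
  std::printf("all 256 immediates agree\n");
}

The same exhaustive check can be repeated for the 213, 132, 231 and 312 forms by substituting the corresponding permutation.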
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching masked loads with different + // operand orders. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, + (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + + // Additional patterns for matching broadcasts in other positions. + def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4))), + (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching zero masking with broadcasts in other + // positions. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching masked broadcasts with different + // operand orders. 
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (i8 imm:$src4)), _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; } multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index ba970bc2048e1..dcce7b9951f26 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -147,7 +147,7 @@ addOffset(const MachineInstrBuilder &MIB, int Offset) { static inline const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) { - return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0); + return MIB.addImm(1).addReg(0).add(Offset).addReg(0); } /// addRegOffset - This function is used to add a memory reference of the form diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index c73c95019f8d2..b85abfb9ca7f3 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -110,3 +110,9 @@ defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than +// SALC is an undocumented instruction. Information for this instruction can be found +// here http://www.rcollins.org/secrets/opcodes/SALC.html +// Set AL if carry. +let Uses = [EFLAGS], Defs = [AL] in { + def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>; +} diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 3c27eb8077d0f..e592c2b3c0aa5 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -259,20 +259,20 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. 
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, AddedComplexity = 20 in + isPseudo = 1, AddedComplexity = 10 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; // Other widths can also make use of the 32-bit xor, which may have a smaller // encoding and avoid partial register updates. +let AddedComplexity = 10 in { def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; -def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { - let AddedComplexity = 20; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>; } let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], - AddedComplexity = 15 in { + AddedComplexity = 10 in { // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, // which only require 3 bytes compared to MOV32ri which requires 5. let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { @@ -287,7 +287,7 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; } -let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in { // AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. // FIXME: Add itinerary class and Schedule. def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", @@ -772,11 +772,11 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", // the pseudo. The argument feeding EBX is ebx_input. // // The additional argument, $ebx_save, is a temporary register used to -// save the value of RBX accross the actual instruction. +// save the value of RBX across the actual instruction. // // To make sure the register assigned to $ebx_save does not interfere with // the definition of the actual instruction, we use a definition $dst which -// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross +// is tied to $rbx_save. That way, the live-range of $rbx_save spans across // the instruction and we are sure we will have a valid register to restore // the value of RBX. let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX], @@ -1743,6 +1743,12 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; +// sub reg, relocImm +def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2), + (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; +def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2), + (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; + // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 2f260c48df474..4ea223e82be9c 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -264,6 +264,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, "jmp{l}\t{*}$dst", [], IIC_JMP_MEM>; } +// Conditional tail calls are similar to the above, but they are branches +// rather than barriers, and they use EFLAGS. 
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [ESP, EFLAGS] in { + def TCRETURNdicc : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; + + // This gets substituted to a conditional jump instruction in MC lowering. + def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs), + (ins i32imm_pcrel:$dst, i32imm:$cond), + "", + [], IIC_JMP_REL>; +} + //===----------------------------------------------------------------------===// // Call Instructions... @@ -325,3 +340,19 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, "rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>; } } + +// Conditional tail calls are similar to the above, but they are branches +// rather than barriers, and they use EFLAGS. +let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [RSP, EFLAGS] in { + def TCRETURNdi64cc : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset, + i32imm:$cond), []>; + + // This gets substituted to a conditional jump instruction in MC lowering. + def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst, i32imm:$cond), + "", + [], IIC_JMP_REL>; +} diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 4b19f801dae1d..1941ae57f0f1f 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -191,13 +191,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, string PackTy, string Suff, SDNode OpNode, RegisterClass RC, - X86MemOperand x86memop> { - defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), - x86memop, RC>; - defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpNode>; - defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC>; + X86MemOperand x86memop> { + let Predicates = [HasFMA, NoAVX512] in { + defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), + x86memop, RC>; + defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), + x86memop, RC, OpNode>; + defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), + x86memop, RC>; + } } // The FMA 213 form is created for lowering of scalar FMA intrinscis diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp index db83497ee69df..00ef65cdb6bd7 100644 --- a/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/lib/Target/X86/X86InstrFMA3Info.cpp @@ -16,11 +16,14 @@ #include "X86InstrInfo.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Threading.h" +#include <cassert> +#include <cstdint> + using namespace llvm; /// This flag is used in the method llvm::call_once() used below to make the /// initialization of the map 'OpcodeToGroup' thread safe. 
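The once-flag change below keeps the same idiom: the 'OpcodeToGroup' map is initialized exactly once, in a thread-safe way, the first time it is needed. A small stand-alone sketch of that idiom using the standard-library equivalents (std::once_flag / std::call_once) is shown here; the map contents and helper names (initOpcodeMap, getOpcodeMap) are invented for illustration and do not come from X86InstrFMA3Info.cpp.

#include <map>
#include <mutex>

static std::once_flag InitMapOnceFlag;
static std::map<unsigned, unsigned> OpcodeToGroupSketch;

// Populate the table; std::call_once guarantees this runs exactly once even
// if several threads reach getOpcodeMap() concurrently.
static void initOpcodeMap() {
  OpcodeToGroupSketch[0x98] = 1; // opcode -> group id (made-up values)
  OpcodeToGroupSketch[0xA8] = 1;
  OpcodeToGroupSketch[0xB8] = 1;
}

static const std::map<unsigned, unsigned> &getOpcodeMap() {
  std::call_once(InitMapOnceFlag, initOpcodeMap);
  return OpcodeToGroupSketch;
}

int main() { return getOpcodeMap().count(0x98) ? 0 : 1; }

llvm::once_flag and llvm::call_once, as used in the patched code, serve the same purpose through llvm/Support/Threading.h.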
-LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag); +static llvm::once_flag InitGroupsOnceFlag; static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj; X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() { diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h index 025cee3b2b909..e3568160da469 100644 --- a/lib/Target/X86/X86InstrFMA3Info.h +++ b/lib/Target/X86/X86InstrFMA3Info.h @@ -1,4 +1,4 @@ -//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===// +//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,9 +18,11 @@ #include "X86.h" #include "llvm/ADT/DenseMap.h" #include <cassert> +#include <cstdint> #include <set> namespace llvm { + /// This class is used to group {132, 213, 231} forms of FMA opcodes together. /// Each of the groups has either 3 register opcodes, 3 memory opcodes, /// or 6 register and memory opcodes. Also, each group has an attrubutes field @@ -201,7 +203,7 @@ public: static X86InstrFMA3Info *getX86InstrFMA3Info(); /// Constructor. Just creates an object of the class. - X86InstrFMA3Info() {} + X86InstrFMA3Info() = default; /// Destructor. Deallocates the memory used for FMA3 Groups. ~X86InstrFMA3Info() { @@ -310,6 +312,7 @@ public: return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end()); } }; -} // namespace llvm -#endif +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 10f3839ea8ed0..11b1d070ef2f9 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -631,6 +631,9 @@ let Defs = [FPSW] in def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>; def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg", IIC_FFREE>; +def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), + "ffreep\t$reg", IIC_FFREE>; + // Clear exceptions let Defs = [FPSW] in @@ -665,15 +668,16 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; let Predicates = [HasFXSR] in { def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, + TB; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; } // Predicates = [FeatureFXSR] } // SchedRW diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 610756aa37da7..c2fe786732dcd 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -199,7 +199,8 @@ class TAPS : TA { Prefix OpPrefix = PS; } class TAPD : TA { Prefix OpPrefix = PD; } class TAXD : TA { Prefix OpPrefix = XD; } class VEX { Encoding OpEnc = EncVEX; } -class VEX_W { bit hasVEX_WPrefix = 1; } +class VEX_W { bits<2> VEX_WPrefix 
= 1; }
+class VEX_WIG { bits<2> VEX_WPrefix = 2; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -270,7 +271,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
- bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field?
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -317,7 +318,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{28-27} = ExeDomain.Value;
let TSFlags{30-29} = OpEncBits;
let TSFlags{38-31} = Opcode;
- let TSFlags{39} = hasVEX_WPrefix;
+ // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
+ let TSFlags{39} = VEX_WPrefix{0};
let TSFlags{40} = hasVEX_4V;
let TSFlags{41} = hasVEX_L;
let TSFlags{42} = hasEVEX_K;
@@ -453,7 +455,7 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
Domain d = GenericDomain>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
- !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
!if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c5689d7c698cc..9867ba84bb9ba 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -27,21 +27,19 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
//===----------------------------------------------------------------------===//
def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
-def load_mvmmx : PatFrag<(ops node:$ptr),
- (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
//===----------------------------------------------------------------------===//
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
- SDTCisFP<1>, SDTCisVT<3, i8>,
- SDTCisVec<1>]>;
-def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
- SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fmins : SDNode<"X86ISD::FMINS", SDTFPBinOp>;
+def X86fmaxs : SDNode<"X86ISD::FMAXS", SDTFPBinOp>;
// Commutative and Associative FMIN and FMAX.
def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
@@ -200,6 +198,15 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+def X86kshiftl : SDNode<"X86ISD::KSHIFTL",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+
def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
@@ -230,10 +237,11 @@ def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTCisSameAs<0,2>, SDTCisSameSizeAs<0,3>,
SDTCisSameNumEltsAs<0, 3>,
+ SDTCisFP<0>, SDTCisInt<3>,
SDTCisVT<4, i8>]>>;
def X86vpperm : SDNode<"X86ISD::VPPERM",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+ SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
@@ -300,13 +308,17 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameSizeAs<0,2>,
- SDTCisSameNumEltsAs<0,2>]>;
+ SDTCisSameNumEltsAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>,
@@ -314,8 +326,10 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameNumEltsAs<0, 3>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -324,9 +338,9 @@ def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
-def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
- SDTCisVT<4, i8>]>;
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>;
def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
@@ -334,16 +348,13 @@ def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
-def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
- SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
- SDTCisVT<4, i32>]>;
+ SDTCisFP<0>, SDTCisVT<4, i32>]>;
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
-def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -367,17 +378,28 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
-def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+ SDTCisVec<1>, SDTCisInt<1>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>;
+ SDTCisSameAs<1,2>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
-def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
-def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>;
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
@@ -414,8 +436,8 @@ def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisFP<1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisFP<1>,
SDTCisSameNumEltsAs<0,1>,
SDTCisVT<2, i32>]>, []>;
def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
@@ -428,9 +450,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
- SDTCisPtrTy<3>]>, []>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
[SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
@@ -440,24 +459,30 @@ def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
+def
X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>; def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; -def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; -def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; -def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; -def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; -def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; -def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFPTernaryOp>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp>; def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; @@ -478,8 +503,10 @@ def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>; def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>; def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>; -def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>; -def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>; +def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>; +def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>; def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 627b6120b048b..7b456fd68343f 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -414,17 +414,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE }, { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, - { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, + { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIZrr, 
X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, + { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, + { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE }, + { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE }, { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, @@ -867,11 +872,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, + { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, + { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, @@ -883,8 +887,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, + { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, + { X86::VPABSBZrr, X86::VPABSBZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VPABSWZrr, X86::VPABSWZrm, 0 }, { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, @@ -904,12 +911,21 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, + { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 }, + { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, + { X86::VPSLLQZri, X86::VPSLLQZmi, 0 }, + { X86::VPSLLWZri, X86::VPSLLWZmi, 0 }, + { X86::VPSRADZri, X86::VPSRADZmi, 0 }, + { X86::VPSRAQZri, X86::VPSRAQZmi, 0 }, + { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, + { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 }, + { X86::VPSRLDZri, X86::VPSRLDZmi, 0 }, + { X86::VPSRLQZri, X86::VPSRLQZmi, 0 }, + { X86::VPSRLWZri, X86::VPSRLWZmi, 0 }, // AVX-512 foldable instructions (256-bit versions) { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, @@ -920,6 +936,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 }, + { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 }, + { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 }, + { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 }, { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, @@ -939,10 +959,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, { X86::VPSHUFHWZ256ri, 
X86::VPSHUFHWZ256mi, 0 }, { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, + { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 }, + { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 }, + { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 }, + { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 }, + { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 }, + { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 }, + { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 }, + { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 }, + { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 }, + { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 }, + { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 }, // AVX-512 foldable instructions (128-bit versions) { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, @@ -953,6 +983,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 }, + { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 }, + { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 }, + { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 }, { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, @@ -970,6 +1004,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, + { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 }, + { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 }, + { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 }, + { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 }, + { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 }, + { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 }, + { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 }, + { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 }, + { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 }, + { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 }, + { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 }, // F16C foldable instructions { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, @@ -1170,18 +1215,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PINSRWrri, X86::PINSRWrmi, 0 }, { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, + { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, + { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, - { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, - { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, + { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, + { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, + { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, - { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, - { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, - { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, - { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, 
TB_ALIGN_16 }, @@ -1340,8 +1385,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, // AVX 128-bit versions of foldable instructions - { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, - { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE }, { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, @@ -1350,8 +1393,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, - { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, - { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, @@ -1458,18 +1499,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, + { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, + { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, - { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, - { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, + { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, + { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, + { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, + { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, - { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, - { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, - { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, - { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, @@ -1626,18 +1667,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, + { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, + { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, - { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, - { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, + { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, + { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, + { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, + { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, - { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, - { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, - { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, - { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, @@ -1732,7 +1773,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, - { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 }, + { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 }, { X86::VPCOMBri, X86::VPCOMBmi, 0 }, { X86::VPCOMDri, X86::VPCOMDmi, 0 }, { X86::VPCOMQri, X86::VPCOMQmi, 0 }, @@ -1742,9 +1783,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, - { X86::VPERMIL2PDrrY, 
X86::VPERMIL2PDmrY, 0 }, + { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 }, { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, - { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 }, + { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 }, { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, @@ -1800,8 +1841,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, @@ -1842,6 +1881,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, + { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, @@ -1850,6 +1890,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, { X86::VORPDZrr, X86::VORPDZrm, 0 }, { X86::VORPSZrr, X86::VORPSZrm, 0 }, + { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 }, + { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 }, + { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 }, + { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 }, { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, @@ -1863,6 +1907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, + { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 }, + { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 }, { X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 }, { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, @@ -1887,26 +1933,55 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, + { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 }, + { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 }, + { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 }, + { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 }, { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, + { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 }, { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, + { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 }, + { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 }, { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, + { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 }, + { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 }, { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, + { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 }, + { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 }, { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, + { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 }, { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, + { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 }, + { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 }, + { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 }, { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, { X86::VPORDZrr, X86::VPORDZrm, 0 }, { X86::VPORQZrr, X86::VPORQZrm, 0 }, + { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 }, { 
X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, + { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 }, + { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 }, { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, + { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 }, + { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 }, + { X86::VPSRADZrr, X86::VPSRADZrm, 0 }, + { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 }, { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, + { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 }, + { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 }, + { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 }, + { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 }, + { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 }, { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, + { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 }, + { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 }, { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, @@ -1957,9 +2032,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, - { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, @@ -1996,6 +2068,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, { X86::VORPSZ128rr, X86::VORPSZ128rm, 0 }, { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, + { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 }, + { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 }, + { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 }, + { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 }, + { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 }, + { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 }, + { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 }, + { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 }, { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, @@ -2022,6 +2102,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, + { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 }, + { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 }, + { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 }, + { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 }, { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, @@ -2070,12 +2154,92 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, + { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 }, + { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 }, + { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 }, + { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 }, + { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 }, + { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 }, + { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 }, + { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 }, + { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 }, + { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 }, + { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 }, + { X86::VPMAXUDZ256rr, 
X86::VPMAXUDZ256rm, 0 }, + { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 }, + { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 }, + { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 }, + { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 }, + { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 }, + { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 }, + { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 }, + { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 }, + { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 }, + { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 }, + { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 }, + { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 }, + { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 }, + { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 }, + { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 }, + { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 }, + { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 }, + { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 }, + { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 }, + { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 }, + { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 }, + { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 }, + { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 }, + { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 }, + { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 }, + { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 }, + { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 }, + { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 }, + { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 }, + { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 }, { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, + { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 }, + { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 }, { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, + { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 }, + { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 }, + { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 }, + { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 }, + { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 }, + { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 }, + { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 }, + { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 }, + { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 }, + { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 }, + { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 }, + { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 }, + { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 }, + { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 }, + { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 }, + { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 }, + { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 }, + { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 }, + { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 }, + { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 }, + { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 }, + { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 }, + { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 }, + { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 }, + { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 }, + { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 }, + { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 }, + { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 }, + { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 }, + { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 }, + { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 }, + { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 }, + { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 }, + { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 }, + { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 }, + { 
X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 }, { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, @@ -2112,6 +2276,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, + { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 }, + { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 }, + { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 }, + { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 }, { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, @@ -2130,6 +2298,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, // AVX-512 masked foldable instructions + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, + { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 }, + { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 }, + { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 }, { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, @@ -2149,8 +2323,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, + { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 }, + { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 }, + { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 }, + { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 }, + { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 }, + { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 }, + { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 }, + { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 }, + { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 }, // AVX-512VL 256-bit masked foldable instructions + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, + { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 }, + { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 }, + { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 }, { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, @@ -2170,8 +2359,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, + { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 }, + { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 }, + { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 }, + { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 }, + { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 }, + { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 }, + { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 }, + { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 }, + { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 }, // AVX-512VL 128-bit masked foldable instructions + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, + { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 }, + { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 }, + { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 }, { X86::VPERMILPDZ128rikz, 
X86::VPERMILPDZ128mikz, 0 }, { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, @@ -2189,6 +2392,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, + { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 }, + { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 }, + { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 }, + { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 }, + { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 }, + { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 }, + { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 }, + { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 }, + { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, @@ -2262,23 +2474,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // XOP foldable instructions { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, - { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 }, + { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 }, { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, - { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, + { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 }, { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, - { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, + { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 }, { X86::VPPERMrrr, X86::VPPERMrrm, 0 }, // AVX-512 instructions with 3 source operands. - { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, - { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, - { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, - { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, - { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, @@ -2329,6 +2532,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 masked instructions { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, @@ -2337,6 +2542,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, @@ -2349,14 +2556,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 }, + { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 }, { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 
0 }, { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 }, + { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 }, { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, + { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 }, + { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 }, + { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 }, + { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 }, { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, @@ -2370,6 +2587,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, + { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 }, + { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 }, { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, @@ -2380,9 +2599,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, + { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 }, + { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 }, + { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 }, + { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 }, + { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 }, + { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 }, + { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 }, + { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 }, + { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 }, + { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 }, + { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 }, + { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 }, + { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 }, + { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 }, + { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 }, + { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 }, + { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 }, + { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 }, + { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 }, + { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 }, + { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 }, { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, + { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 }, + { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 }, + { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 }, + { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 }, + { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 }, + { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 }, + { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 }, + { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 }, + { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 }, + { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 }, + { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 }, + { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 }, + { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 }, + { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 }, + { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 }, + { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 }, + { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 }, + { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 }, { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, @@ -2401,8 +2659,12 @@ 
X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, + { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 }, + { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 }, { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, + { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, @@ -2437,6 +2699,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 }, + { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 }, + { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 }, + { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 }, + { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 }, { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, @@ -2450,6 +2716,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, + { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 }, + { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 }, { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, @@ -2460,9 +2728,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, + { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 }, + { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 }, + { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 }, + { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 }, + { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 }, + { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 }, + { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 }, + { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 }, + { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 }, + { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 }, + { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 }, + { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 }, + { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 }, + { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 }, + { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 }, + { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 }, + { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 }, + { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 }, + { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 }, + { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 }, + { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 }, { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, + { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 }, + { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 }, + { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 }, + { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 }, + { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 }, + { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 }, + { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 
}, + { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 }, + { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 }, + { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 }, + { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 }, + { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 }, + { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 }, + { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 }, + { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 }, + { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 }, + { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 }, + { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 }, { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, @@ -2481,6 +2788,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, + { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 }, + { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 }, { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, @@ -2513,6 +2822,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, + { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 }, + { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 }, + { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 }, + { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 }, { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, @@ -2526,15 +2839,56 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, + { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 }, + { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 }, { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, + { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 }, + { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 }, + { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 }, + { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 }, + { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 }, + { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 }, + { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 }, + { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 }, + { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 }, + { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 }, + { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 }, + { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 }, + { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 }, + { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 }, + { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 }, + { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 }, + { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 }, + { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 }, + { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 }, + { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 }, + { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 }, { 
X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, + { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 }, + { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 }, + { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 }, + { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 }, + { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 }, + { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 }, + { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 }, + { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 }, + { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 }, + { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 }, + { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 }, + { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 }, + { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 }, + { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 }, + { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 }, + { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 }, + { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 }, + { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 }, { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, @@ -2553,6 +2907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, + { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 }, + { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 }, { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, @@ -2563,6 +2919,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, // AVX-512 masked foldable instructions + { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, + { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 }, + { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 }, + { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 }, + { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 }, { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, @@ -2582,8 +2944,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, + { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 }, + { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 }, + { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 }, + { X86::VPSRADZrik, X86::VPSRADZmik, 0 }, + { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 }, + { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 }, + { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 }, + { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 }, + { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 }, // AVX-512VL 256-bit masked foldable instructions + { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, + { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 }, + { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 }, + { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 }, + { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 }, { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, @@ -2603,8 +2980,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 
0 }, { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, + { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 }, + { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 }, + { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 }, + { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 }, + { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 }, + { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 }, + { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 }, + { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 }, + { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 }, // AVX-512VL 128-bit masked foldable instructions + { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, + { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 }, + { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 }, + { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 }, + { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 }, { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, @@ -2622,6 +3013,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, + { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 }, + { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 }, + { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 }, + { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 }, + { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 }, + { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 }, + { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 }, + { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 }, + { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { @@ -2651,6 +3051,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable masked instructions { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, + { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, @@ -2659,6 +3061,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, + { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, @@ -2671,14 +3075,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 }, + { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 }, { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 }, + { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 }, { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, + { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, + { 
X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 }, + { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 }, + { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 }, + { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 }, { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, @@ -2692,6 +3106,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, + { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 }, + { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 }, { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, @@ -2714,9 +3130,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, + { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 }, + { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 }, + { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 }, + { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 }, + { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 }, + { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 }, + { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 }, + { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 }, + { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 }, + { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 }, + { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 }, + { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 }, + { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 }, + { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 }, + { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 }, + { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 }, + { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 }, + { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 }, + { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 }, + { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 }, + { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 }, { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, + { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 }, + { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 }, + { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 }, + { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 }, + { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 }, + { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 }, + { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 }, + { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 }, + { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 }, + { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 }, + { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 }, + { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 }, + { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 }, + { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 }, + { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 }, + { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 }, + { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 }, + { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 }, { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, @@ -2736,8 +3191,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, + { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 }, + { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 }, { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, + { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, @@ 
-2772,6 +3231,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, + { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 }, + { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 }, + { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 }, + { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 }, { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, @@ -2785,6 +3248,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, + { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 }, + { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 }, { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, @@ -2807,9 +3272,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, + { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 }, + { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 }, + { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 }, + { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 }, + { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 }, + { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 }, + { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 }, + { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 }, + { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 }, + { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 }, + { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 }, + { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 }, + { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 }, + { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 }, + { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 }, + { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 }, + { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 }, + { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 }, + { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 }, + { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 }, + { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 }, { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, + { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 }, + { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 }, + { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 }, + { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 }, + { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 }, + { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 }, + { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 }, + { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 }, + { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 }, + { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 }, + { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 }, + { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 }, + { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 }, + { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 }, + { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 }, + { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 }, + { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 }, + { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 }, { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, @@ -2830,6 +3334,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget 
&STI) { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, + { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 }, + { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 }, { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, @@ -2862,6 +3368,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, + { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 }, + { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 }, + { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 }, + { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 }, { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, @@ -2875,6 +3385,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, + { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 }, + { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 }, { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, @@ -2893,9 +3405,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, + { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 }, + { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 }, + { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 }, + { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 }, + { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 }, + { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 }, + { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 }, + { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 }, + { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 }, + { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 }, + { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 }, + { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 }, + { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 }, + { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 }, + { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 }, + { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 }, + { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 }, + { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 }, + { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 }, + { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 }, + { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 }, { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, + { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 }, + { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 }, + { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 }, + { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 }, + { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 }, + { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 }, + { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 }, + { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 }, + { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 }, + { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 }, + { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 }, + { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 }, + { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 }, + { X86::VPSRLQZ128rrk, 
X86::VPSRLQZ128rmk, 0 }, + { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 }, + { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 }, + { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 }, + { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 }, { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, @@ -2916,6 +3467,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, + { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 }, + { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 }, { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, @@ -3063,18 +3616,13 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - if (MI.getOpcode() == getCallFrameSetupOpcode() || - MI.getOpcode() == getCallFrameDestroyOpcode()) { + if (isFrameInstr(MI)) { unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = - (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign; - - SPAdj -= MI.getOperand(1).getImm(); - - if (MI.getOpcode() == getCallFrameSetupOpcode()) - return SPAdj; - else - return -SPAdj; + int SPAdj = alignTo(getFrameSize(MI), StackAlign); + SPAdj -= getFrameAdjustment(MI); + if (!isFrameSetup(MI)) + SPAdj = -SPAdj; + return SPAdj; } // To know whether a call adjusts the stack, we need information @@ -3569,7 +4117,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, const DebugLoc &DL = Orig.getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)) - .addOperand(Orig.getOperand(0)) + .add(Orig.getOperand(0)) .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); @@ -3654,10 +4202,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), - get(TargetOpcode::COPY)) - .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) - .addOperand(Src); + MachineInstr *Copy = + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + .add(Src); // Which is obviously going to be dead after we're done with it. isKill = true; @@ -3823,10 +4371,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest) + .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) - .addOperand(Src) + .add(Src) .addImm(0) .addReg(0); break; @@ -3848,14 +4396,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .addOperand(Dest) + .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) .addImm(0) .addReg(0); if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MIB.add(ImplicitOp); NewMI = MIB; break; @@ -3869,10 +4417,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest) + .add(Dest) .addReg(0) .addImm(1ULL << ShAmt) - .addOperand(Src) + .add(Src) .addImm(0) .addReg(0); break; @@ -3891,11 +4439,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .addOperand(Dest) + .add(Dest) .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MIB.add(ImplicitOp); NewMI = addOffset(MIB, 1); break; @@ -3905,10 +4453,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest) - .addOperand(Src), - 1); + NewMI = addOffset( + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), 1); break; case X86::DEC64r: case X86::DEC32r: { @@ -3924,11 +4470,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .addOperand(Dest) + .add(Dest) .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MIB.add(ImplicitOp); NewMI = addOffset(MIB, -1); @@ -3939,10 +4485,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest) - .addOperand(Src), - -1); + NewMI = addOffset( + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), -1); break; case X86::ADD64rr: case X86::ADD64rr_DB: @@ -3970,12 +4514,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg2, isKill2, isUndef2, ImplicitOp2, LV)) return nullptr; - MachineInstrBuilder MIB = - BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest); + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest); if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MIB.add(ImplicitOp); if (ImplicitOp2.getReg() != 0) - MIB.addOperand(ImplicitOp2); + MIB.add(ImplicitOp2); NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); @@ -3995,9 +4538,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Src2 = MI.getOperand(2).getReg(); bool isKill2 = MI.getOperand(2).isKill(); - NewMI = addRegReg( - BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); + NewMI = addRegReg(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); // Preserve undefness of the operands. 
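The getSPAdjust() hunk a little further up replaces the hand-rolled round-up with alignTo() and the new isFrameInstr()/getFrameSize()/getFrameAdjustment() helpers. A minimal standalone sketch of the same arithmetic, using plain integers in place of the MachineInstr accessors (the helper names below are illustrative, not the real API):

#include <cstdint>

// Round Size up to the next multiple of Align (a power of two), as
// llvm::alignTo() does in the rewritten getSPAdjust().
static int64_t alignToMultiple(int64_t Size, int64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

// SP adjustment for a call-frame setup/destroy pseudo: the aligned frame
// size minus the adjustment already made inside the sequence, negated on
// the destroy side -- the same shape as the new code above.
int64_t callFrameSPAdjust(int64_t FrameSize, int64_t FrameAdjustment,
                          int64_t StackAlign, bool IsSetup) {
  int64_t SPAdj = alignToMultiple(FrameSize, StackAlign) - FrameAdjustment;
  return IsSetup ? SPAdj : -SPAdj;
}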
bool isUndef = MI.getOperand(1).isUndef(); @@ -4014,10 +4556,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD64ri32_DB: case X86::ADD64ri8_DB: assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest) - .addOperand(Src), - MI.getOperand(2)); + NewMI = addOffset( + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src), + MI.getOperand(2)); break; case X86::ADD32ri: case X86::ADD32ri8: @@ -4034,11 +4575,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .addOperand(Dest) + .add(Dest) .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MIB.add(ImplicitOp); NewMI = addOffset(MIB, MI.getOperand(2)); break; @@ -4051,12 +4592,136 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest) - .addOperand(Src), - MI.getOperand(2)); + NewMI = addOffset( + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), + MI.getOperand(2)); + break; + + case X86::VMOVDQU8Z128rmk: + case X86::VMOVDQU8Z256rmk: + case X86::VMOVDQU8Zrmk: + case X86::VMOVDQU16Z128rmk: + case X86::VMOVDQU16Z256rmk: + case X86::VMOVDQU16Zrmk: + case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk: + case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk: + case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk: + case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk: + case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk: + case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk: + case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk: + case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk: + case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: + case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: + case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: + case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: { + unsigned Opc; + switch (MIOpc) { + default: llvm_unreachable("Unreachable!"); + case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; + case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; + case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; + case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; + case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; + case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; + case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case 
X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + } + + NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .add(MI.getOperand(2)) + .add(Src) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)) + .add(MI.getOperand(6)) + .add(MI.getOperand(7)); break; } + case X86::VMOVDQU8Z128rrk: + case X86::VMOVDQU8Z256rrk: + case X86::VMOVDQU8Zrrk: + case X86::VMOVDQU16Z128rrk: + case X86::VMOVDQU16Z256rrk: + case X86::VMOVDQU16Zrrk: + case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk: + case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk: + case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk: + case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk: + case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk: + case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk: + case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk: + case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk: + case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk: + case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk: + case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk: + case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: { + unsigned Opc; + switch (MIOpc) { + default: llvm_unreachable("Unreachable!"); + case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break; + case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break; + case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break; + case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break; + case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break; + case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break; + case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; + case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; + case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break; + case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; + case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; + case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break; + case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; + case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; + case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break; + case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; + case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; + case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break; + case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break; + case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break; + case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break; + case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break; + case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break; + case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break; + case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break; + case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break; + case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break; + case 
X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break; + case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break; + case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break; + } + + NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .add(MI.getOperand(2)) + .add(Src) + .add(MI.getOperand(3)); + break; + } + } if (!NewMI) return nullptr; @@ -4337,6 +5002,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::PFSUBrr: + case X86::PFSUBRrr: { + // PFSUB x, y: x = x - y + // PFSUBR x, y: x = y - x + unsigned Opc = + (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + break; + } case X86::BLENDPDrri: case X86::BLENDPSrri: case X86::PBLENDWrri: @@ -4606,18 +5283,30 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: - case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: - case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: - case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: - case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: - case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: - case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrik: + case X86::VPTERNLOGDZ128rrik: + case X86::VPTERNLOGDZ256rrik: + case X86::VPTERNLOGQZrrik: + case X86::VPTERNLOGQZ128rrik: + case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: - case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: { + case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: + case X86::VPTERNLOGDZ128rmbi: + case X86::VPTERNLOGDZ256rmbi: + case X86::VPTERNLOGDZrmbi: + case X86::VPTERNLOGQZ128rmbi: + case X86::VPTERNLOGQZ256rmbi: + case X86::VPTERNLOGQZrmbi: + case X86::VPTERNLOGDZ128rmbikz: + case X86::VPTERNLOGDZ256rmbikz: + case X86::VPTERNLOGDZrmbikz: + case X86::VPTERNLOGQZ128rmbikz: + case X86::VPTERNLOGQZ256rmbikz: + case X86::VPTERNLOGQZrmbikz: { auto &WorkingMI = cloneIfNew(MI); if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2)) return nullptr; @@ -4798,18 +5487,30 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi: case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi: case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi: - case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik: - case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik: - case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik: - case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik: - case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik: - case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik: + case X86::VPTERNLOGDZrrik: + case X86::VPTERNLOGDZ128rrik: + case X86::VPTERNLOGDZ256rrik: + case X86::VPTERNLOGQZrrik: + case X86::VPTERNLOGQZ128rrik: + case X86::VPTERNLOGQZ256rrik: case 
X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz: case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz: case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz: case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz: case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz: case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: + case X86::VPTERNLOGDZ128rmbi: + case X86::VPTERNLOGDZ256rmbi: + case X86::VPTERNLOGDZrmbi: + case X86::VPTERNLOGQZ128rmbi: + case X86::VPTERNLOGQZ256rmbi: + case X86::VPTERNLOGQZrmbi: + case X86::VPTERNLOGDZ128rmbikz: + case X86::VPTERNLOGDZ256rmbikz: + case X86::VPTERNLOGDZrmbikz: + case X86::VPTERNLOGQZ128rmbikz: + case X86::VPTERNLOGQZ256rmbikz: + case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); default: const X86InstrFMA3Group *FMA3Group = @@ -5108,6 +5809,95 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { return !isPredicated(MI); } +bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + return true; + default: + return false; + } +} + +bool X86InstrInfo::canMakeTailCallConditional( + SmallVectorImpl<MachineOperand> &BranchCond, + const MachineInstr &TailCall) const { + if (TailCall.getOpcode() != X86::TCRETURNdi && + TailCall.getOpcode() != X86::TCRETURNdi64) { + // Only direct calls can be done with a conditional branch. + return false; + } + + const MachineFunction *MF = TailCall.getParent()->getParent(); + if (Subtarget.isTargetWin64() && MF->hasWinCFI()) { + // Conditional tail calls confuse the Win64 unwinder. + return false; + } + + assert(BranchCond.size() == 1); + if (BranchCond[0].getImm() > X86::LAST_VALID_COND) { + // Can't make a conditional tail call with this condition. + return false; + } + + const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + if (X86FI->getTCReturnAddrDelta() != 0 || + TailCall.getOperand(1).getImm() != 0) { + // A conditional tail call cannot do any stack adjustment. + return false; + } + + return true; +} + +void X86InstrInfo::replaceBranchWithTailCall( + MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond, + const MachineInstr &TailCall) const { + assert(canMakeTailCallConditional(BranchCond, TailCall)); + + MachineBasicBlock::iterator I = MBB.end(); + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + if (!I->isBranch()) + assert(0 && "Can't find the branch to replace!"); + + X86::CondCode CC = getCondFromBranchOpc(I->getOpcode()); + assert(BranchCond.size() == 1); + if (CC != BranchCond[0].getImm()) + continue; + + break; + } + + unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc + : X86::TCRETURNdi64cc; + + auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc)); + MIB->addOperand(TailCall.getOperand(0)); // Destination. + MIB.addImm(0); // Stack offset (not used). + MIB->addOperand(BranchCond[0]); // Condition. + MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters. + + // Add implicit uses and defs of all live regs potentially clobbered by the + // call. This way they still appear live across the call. 
+ LivePhysRegs LiveRegs(&getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers; + LiveRegs.stepForward(*MIB, Clobbers); + for (const auto &C : Clobbers) { + MIB.addReg(C.first, RegState::Implicit); + MIB.addReg(C.first, RegState::Implicit | RegState::Define); + } + + I->eraseFromParent(); +} + // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may // not be a fallthrough MBB now due to layout changes). Return nullptr if the // fallthrough MBB cannot be identified. @@ -5514,8 +6304,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg, // SrcReg(MaskReg) -> DestReg(GR64) // SrcReg(MaskReg) -> DestReg(GR32) - // SrcReg(MaskReg) -> DestReg(GR16) - // SrcReg(MaskReg) -> DestReg(GR8) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(SrcReg)) { @@ -5525,20 +6313,10 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg, } if (X86::GR32RegClass.contains(DestReg)) return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk; - if (X86::GR16RegClass.contains(DestReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return X86::KMOVWrk; - } - if (X86::GR8RegClass.contains(DestReg)) { - DestReg = getX86SubSuperRegister(DestReg, 32); - return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk; - } } // SrcReg(GR64) -> DestReg(MaskReg) // SrcReg(GR32) -> DestReg(MaskReg) - // SrcReg(GR16) -> DestReg(MaskReg) - // SrcReg(GR8) -> DestReg(MaskReg) // All KMASK RegClasses hold the same k registers, can be tested against anyone. if (X86::VK16RegClass.contains(DestReg)) { @@ -5548,14 +6326,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg, } if (X86::GR32RegClass.contains(SrcReg)) return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr; - if (X86::GR16RegClass.contains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return X86::KMOVWkr; - } - if (X86::GR8RegClass.contains(SrcReg)) { - SrcReg = getX86SubSuperRegister(SrcReg, 32); - return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr; - } } @@ -5965,7 +6735,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); + MIB.add(Addr[i]); MIB.addReg(SrcReg, getKillRegState(isKill)); (*MIB).setMemRefs(MMOBegin, MMOEnd); NewMIs.push_back(MIB); @@ -6000,7 +6770,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.addOperand(Addr[i]); + MIB.add(Addr[i]); (*MIB).setMemRefs(MMOBegin, MMOEnd); NewMIs.push_back(MIB); } @@ -6017,12 +6787,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: - if (!MI.getOperand(1).isImm()) - return false; SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI.getOperand(1).getImm(); + if (MI.getOperand(1).isImm()) { + CmpMask = ~0; + CmpValue = MI.getOperand(1).getImm(); + } else { + CmpMask = CmpValue = 0; + } return true; // A SUB can be used to perform comparison. 
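The new isUnconditionalTailCall / canMakeTailCallConditional / replaceBranchWithTailCall hooks above only accept a direct tail call (TCRETURNdi/TCRETURNdi64) with an ordinary condition code and no stack adjustment. A rough illustration of the source pattern they target (function names are made up, and the commented assembly is the expected shape rather than verified output):

// A conditional early exit whose taken path does nothing but tail-call a
// known function -- the only shape canMakeTailCallConditional() accepts.
extern int fallback(int);

int dispatch(int x) {
  if (x < 0)
    return fallback(x); // direct tail call, no extra stack adjustment
  return x * 2;
}

// The branch-folding pass can use these hooks to replace
//     js .LBB0_2   ...   .LBB0_2: jmp fallback
// with a single conditional tail call,
//     js fallback        # lowered from TCRETURNdicc
// rather than branching to a block that contains only the tail-call jump.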
case X86::SUB64rm: @@ -6031,7 +6803,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::SUB8rm: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; - CmpMask = ~0; + CmpMask = 0; CmpValue = 0; return true; case X86::SUB64rr: @@ -6040,7 +6812,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::SUB8rr: SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); - CmpMask = ~0; + CmpMask = 0; CmpValue = 0; return true; case X86::SUB64ri32: @@ -6050,12 +6822,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: - if (!MI.getOperand(2).isImm()) - return false; SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI.getOperand(2).getImm(); + if (MI.getOperand(2).isImm()) { + CmpMask = ~0; + CmpValue = MI.getOperand(2).getImm(); + } else { + CmpMask = CmpValue = 0; + } return true; case X86::CMP64rr: case X86::CMP32rr: @@ -6063,7 +6837,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, case X86::CMP8rr: SrcReg = MI.getOperand(0).getReg(); SrcReg2 = MI.getOperand(1).getReg(); - CmpMask = ~0; + CmpMask = 0; CmpValue = 0; return true; case X86::TEST8rr: @@ -6089,8 +6863,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg, - unsigned SrcReg2, int ImmValue, - MachineInstr &OI) { + unsigned SrcReg2, int ImmMask, + int ImmValue, MachineInstr &OI) { if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) || (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) || (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) || @@ -6101,7 +6875,8 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg, OI.getOperand(2).getReg() == SrcReg))) return true; - if (((FlagI.getOpcode() == X86::CMP64ri32 && + if (ImmMask != 0 && + ((FlagI.getOpcode() == X86::CMP64ri32 && OI.getOpcode() == X86::SUB64ri32) || (FlagI.getOpcode() == X86::CMP64ri8 && OI.getOpcode() == X86::SUB64ri8) || @@ -6288,7 +7063,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. - bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); + bool IsCmpZero = (CmpMask != 0 && CmpValue == 0); if (IsCmpZero && MI->getParent() != CmpInstr.getParent()) return false; @@ -6338,8 +7113,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (; RI != RE; ++RI) { MachineInstr &Instr = *RI; // Check whether CmpInstr can be made redundant by the current instruction. 
- if (!IsCmpZero && - isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { + if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, + CmpValue, Instr)) { Sub = &Instr; break; } @@ -6764,14 +7539,33 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { assert(HasAVX && "AVX not supported"); return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); case X86::AVX512_128_SET0: - return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr)); - case X86::AVX512_256_SET0: - return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr)); + case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0SD: { + bool HasVLX = Subtarget.hasVLX(); + unsigned SrcReg = MIB->getOperand(0).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) + return Expand2AddrUndef(MIB, + get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); + // Extended register without VLX. Use a larger XOR. + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); + MIB->getOperand(0).setReg(SrcReg); + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + } + case X86::AVX512_256_SET0: { + bool HasVLX = Subtarget.hasVLX(); + unsigned SrcReg = MIB->getOperand(0).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) + return Expand2AddrUndef(MIB, + get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr)); + // Extended register without VLX. Use a larger XOR. + SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); + MIB->getOperand(0).setReg(SrcReg); + return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); + } case X86::AVX512_512_SET0: return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); - case X86::AVX512_FsFLD0SS: - case X86::AVX512_FsFLD0SD: - return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: @@ -6838,11 +7632,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. - case X86::KSET0B: case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); - case X86::KSET1B: case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); @@ -6860,7 +7652,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, if (NumAddrOps < 4) { // FrameIndex only - add an immediate offset (whether its zero or not). 
for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); + MIB.add(MOs[i]); addOffset(MIB, PtrOffset); } else { // General Memory Addressing - we need to add any offset to an existing @@ -6871,7 +7663,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, if (i == 3 && PtrOffset != 0) { MIB.addDisp(MO, PtrOffset); } else { - MIB.addOperand(MO); + MIB.add(MO); } } } @@ -6893,11 +7685,11 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, unsigned NumOps = MI.getDesc().getNumOperands() - 2; for (unsigned i = 0; i != NumOps; ++i) { MachineOperand &MO = MI.getOperand(i + 2); - MIB.addOperand(MO); + MIB.add(MO); } for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - MIB.addOperand(MO); + MIB.add(MO); } MachineBasicBlock *MBB = InsertPt->getParent(); @@ -6922,7 +7714,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, assert(MO.isReg() && "Expected to fold into reg operand!"); addOperands(MIB, MOs, PtrOffset); } else { - MIB.addOperand(MO); + MIB.add(MO); } } @@ -7226,7 +8018,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) { return false; } -/// Inform the ExeDepsFix pass how many idle +/// Inform the ExecutionDepsFix pass how many idle /// instructions we would like before a partial register update. unsigned X86InstrInfo::getPartialRegUpdateClearance( const MachineInstr &MI, unsigned OpNum, @@ -7344,11 +8136,15 @@ static bool hasUndefRegUpdate(unsigned Opcode) { case X86::VCVTUSI642SDZrrb_Int: case X86::VCVTUSI642SDZrm_Int: case X86::VCVTSD2SSZrr: - case X86::VCVTSD2SSZrrb: + case X86::VCVTSD2SSZrr_Int: + case X86::VCVTSD2SSZrrb_Int: case X86::VCVTSD2SSZrm: + case X86::VCVTSD2SSZrm_Int: case X86::VCVTSS2SDZrr: - case X86::VCVTSS2SDZrrb: + case X86::VCVTSS2SDZrr_Int: + case X86::VCVTSS2SDZrrb_Int: case X86::VCVTSS2SDZrm: + case X86::VCVTSS2SDZrm_Int: case X86::VRNDSCALESDr: case X86::VRNDSCALESDrb: case X86::VRNDSCALESDm: @@ -7375,8 +8171,8 @@ static bool hasUndefRegUpdate(unsigned Opcode) { return false; } -/// Inform the ExeDepsFix pass how many idle instructions we would like before -/// certain undef register reads. +/// Inform the ExecutionDepsFix pass how many idle instructions we would like +/// before certain undef register reads. 
/// /// This catches the VCVTSI2SD family of instructions: /// @@ -7522,6 +8318,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: + case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz: + case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: + case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz: + case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz: + case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz: + case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz: case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int: case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int: @@ -7536,6 +8338,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int: case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int: case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int: + case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk: + case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk: + case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk: + case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk: + case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk: + case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk: + case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz: + case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz: + case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz: + case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz: + case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz: + case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz: return false; default: return true; @@ -7555,6 +8369,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: + case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz: + case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: + case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz: + case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz: + case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz: + case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz: case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int: case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int: @@ -7569,6 +8389,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int: case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int: case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int: + case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk: + case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk: + case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk: + case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk: + case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk: + case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk: + case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz: + case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz: + case 
X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz: + case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz: + case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz: + case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz: return false; default: return true; @@ -7800,11 +8632,11 @@ bool X86InstrInfo::unfoldMemoryOperand( if (FoldedStore) MIB.addReg(Reg, RegState::Define); for (MachineOperand &BeforeOp : BeforeOps) - MIB.addOperand(BeforeOp); + MIB.add(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); for (MachineOperand &AfterOp : AfterOps) - MIB.addOperand(AfterOp); + MIB.add(AfterOp); for (MachineOperand &ImpOp : ImpOps) { MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) | @@ -8143,28 +8975,29 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, break; } - // Check if chain operands and base addresses match. - if (Load1->getOperand(0) != Load2->getOperand(0) || - Load1->getOperand(5) != Load2->getOperand(5)) + // Lambda to check if both the loads have the same value for an operand index. + auto HasSameOp = [&](int I) { + return Load1->getOperand(I) == Load2->getOperand(I); + }; + + // All operands except the displacement should match. + if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) || + !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg)) return false; - // Segment operands should match as well. - if (Load1->getOperand(4) != Load2->getOperand(4)) + + // Chain Operand must be the same. + if (!HasSameOp(5)) return false; - // Scale should be 1, Index should be Reg0. - if (Load1->getOperand(1) == Load2->getOperand(1) && - Load1->getOperand(2) == Load2->getOperand(2)) { - if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1) - return false; - // Now let's examine the displacements. - if (isa<ConstantSDNode>(Load1->getOperand(3)) && - isa<ConstantSDNode>(Load2->getOperand(3))) { - Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue(); - Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue(); - return true; - } - } - return false; + // Now let's examine if the displacements are constants. + auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp)); + auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp)); + if (!Disp1 || !Disp2) + return false; + + Offset1 = Disp1->getSExtValue(); + Offset2 = Disp2->getSExtValue(); + return true; } bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, @@ -8215,165 +9048,6 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const { - // Check if this processor supports macro-fusion. Since this is a minor - // heuristic, we haven't specifically reserved a feature. hasAVX is a decent - // proxy for SandyBridge+. 
- if (!Subtarget.hasAVX()) - return false; - - enum { - FuseTest, - FuseCmp, - FuseInc - } FuseKind; - - switch (Second.getOpcode()) { - default: - return false; - case X86::JE_1: - case X86::JNE_1: - case X86::JL_1: - case X86::JLE_1: - case X86::JG_1: - case X86::JGE_1: - FuseKind = FuseInc; - break; - case X86::JB_1: - case X86::JBE_1: - case X86::JA_1: - case X86::JAE_1: - FuseKind = FuseCmp; - break; - case X86::JS_1: - case X86::JNS_1: - case X86::JP_1: - case X86::JNP_1: - case X86::JO_1: - case X86::JNO_1: - FuseKind = FuseTest; - break; - } - switch (First.getOpcode()) { - default: - return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - case X86::TEST8ri: - case X86::TEST16ri: - case X86::TEST32ri: - case X86::TEST32i32: - case X86::TEST64i32: - case X86::TEST64ri32: - case X86::TEST8rm: - case X86::TEST16rm: - case X86::TEST32rm: - case X86::TEST64rm: - case X86::TEST8ri_NOREX: - case X86::AND16i16: - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND16rm: - case X86::AND16rr: - case X86::AND32i32: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND32rm: - case X86::AND32rr: - case X86::AND64i32: - case X86::AND64ri32: - case X86::AND64ri8: - case X86::AND64rm: - case X86::AND64rr: - case X86::AND8i8: - case X86::AND8ri: - case X86::AND8rm: - case X86::AND8rr: - return true; - case X86::CMP16i16: - case X86::CMP16ri: - case X86::CMP16ri8: - case X86::CMP16rm: - case X86::CMP16rr: - case X86::CMP32i32: - case X86::CMP32ri: - case X86::CMP32ri8: - case X86::CMP32rm: - case X86::CMP32rr: - case X86::CMP64i32: - case X86::CMP64ri32: - case X86::CMP64ri8: - case X86::CMP64rm: - case X86::CMP64rr: - case X86::CMP8i8: - case X86::CMP8ri: - case X86::CMP8rm: - case X86::CMP8rr: - case X86::ADD16i16: - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri8_DB: - case X86::ADD16ri_DB: - case X86::ADD16rm: - case X86::ADD16rr: - case X86::ADD16rr_DB: - case X86::ADD32i32: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD32ri8_DB: - case X86::ADD32ri_DB: - case X86::ADD32rm: - case X86::ADD32rr: - case X86::ADD32rr_DB: - case X86::ADD64i32: - case X86::ADD64ri32: - case X86::ADD64ri32_DB: - case X86::ADD64ri8: - case X86::ADD64ri8_DB: - case X86::ADD64rm: - case X86::ADD64rr: - case X86::ADD64rr_DB: - case X86::ADD8i8: - case X86::ADD8mi: - case X86::ADD8mr: - case X86::ADD8ri: - case X86::ADD8rm: - case X86::ADD8rr: - case X86::SUB16i16: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB16rm: - case X86::SUB16rr: - case X86::SUB32i32: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB32rm: - case X86::SUB32rr: - case X86::SUB64i32: - case X86::SUB64ri32: - case X86::SUB64ri8: - case X86::SUB64rm: - case X86::SUB64rr: - case X86::SUB8i8: - case X86::SUB8ri: - case X86::SUB8rm: - case X86::SUB8rr: - return FuseKind == FuseCmp || FuseKind == FuseInc; - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::INC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::DEC8r: - return FuseKind == FuseInc; - } -} - bool X86InstrInfo:: reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); @@ -8424,6 +9098,7 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, + { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr }, { X86::MOVSSmr, X86::MOVSSmr, 
X86::MOVPDI2DImr }, { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm }, { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm }, @@ -8443,6 +9118,7 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr }, + { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr }, { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr }, { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm }, { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm }, @@ -8465,7 +9141,7 @@ static const uint16_t ReplaceableInstrs[][3] = { // AVX512 support { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr }, { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, - { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, + { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr }, { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr }, { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr }, { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr }, @@ -8493,10 +9169,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, - { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, - { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, - { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, - { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, @@ -8508,6 +9180,14 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 }, }; +static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, + { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, + { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, + { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, +}; + static const uint16_t ReplaceableInstrsAVX512[][4] = { // Two integer columns for 64-bit and 32-bit elements. //PackedSingle PackedDouble PackedInt PackedInt @@ -8769,16 +9449,25 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { validDomains = 0xe; } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) { validDomains = Subtarget.hasAVX2() ? 0xe : 0x6; + } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) { + // Insert/extract instructions should only effect domain if AVX2 + // is enabled. + if (!Subtarget.hasAVX2()) + return std::make_pair(0, 0); + validDomains = 0xe; } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) { validDomains = 0xe; - } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) { - validDomains = Subtarget.hasDQI() ? 0xe : 0x8; - } else if (const uint16_t *table = lookupAVX512(opcode, domain, + } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain, + ReplaceableInstrsAVX512DQ)) { + validDomains = 0xe; + } else if (Subtarget.hasDQI()) { + if (const uint16_t *table = lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) { - if (domain == 1 || (domain == 3 && table[3] == opcode)) - validDomains = Subtarget.hasDQI() ? 
0xa : 0x8; - else - validDomains = Subtarget.hasDQI() ? 0xc : 0x8; + if (domain == 1 || (domain == 3 && table[3] == opcode)) + validDomains = 0xa; + else + validDomains = 0xc; + } } } return std::make_pair(domain, validDomains); @@ -8794,6 +9483,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { "256-bit vector operations only available in AVX2"); table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2); } + if (!table) { // try the other table + assert(Subtarget.hasAVX2() && + "256-bit insert/extract only available in AVX2"); + table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract); + } if (!table) { // try the AVX512 table assert(Subtarget.hasAVX512() && "Requires AVX-512"); table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512); @@ -9457,28 +10151,6 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } -bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const { - switch (Inst.getOpcode()) { - case X86::TCRETURNdi: - case X86::TCRETURNmi: - case X86::TCRETURNri: - case X86::TCRETURNdi64: - case X86::TCRETURNmi64: - case X86::TCRETURNri64: - case X86::TAILJMPd: - case X86::TAILJMPm: - case X86::TAILJMPr: - case X86::TAILJMPd64: - case X86::TAILJMPm64: - case X86::TAILJMPr64: - case X86::TAILJMPm64_REX: - case X86::TAILJMPr64_REX: - return true; - default: - return false; - } -} - namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. @@ -9665,3 +10337,124 @@ namespace { char LDTLSCleanup::ID = 0; FunctionPass* llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } + +unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize, + size_t Occurrences, + bool CanBeTailCall) const { + unsigned NotOutlinedSize = SequenceSize * Occurrences; + unsigned OutlinedSize; + + // Is it a tail call? + if (CanBeTailCall) { + // If yes, we don't have to include a return instruction-- it's already in + // our sequence. So we have one occurrence of the sequence + #Occurrences + // calls. + OutlinedSize = SequenceSize + Occurrences; + } else { + // If not, add one for the return instruction. + OutlinedSize = (SequenceSize + 1) + Occurrences; + } + + // Return the number of instructions saved by outlining this sequence. + return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0; +} + +bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const { + return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); +} + +X86GenInstrInfo::MachineOutlinerInstrType +X86InstrInfo::getOutliningType(MachineInstr &MI) const { + + // Don't allow debug values to impact outlining type. + if (MI.isDebugValue() || MI.isIndirectDebugValue()) + return MachineOutlinerInstrType::Invisible; + + // Is this a tail call? If yes, we can outline as a tail call. + if (isTailCall(MI)) + return MachineOutlinerInstrType::Legal; + + // Is this the terminator of a basic block? + if (MI.isTerminator() || MI.isReturn()) { + + // Does its parent have any successors in its MachineFunction? + if (MI.getParent()->succ_empty()) + return MachineOutlinerInstrType::Legal; + + // It does, so we can't tail call it. + return MachineOutlinerInstrType::Illegal; + } + + // Don't outline anything that modifies or reads from the stack pointer. + // + // FIXME: There are instructions which are being manually built without + // explicit uses/defs so we also have to check the MCInstrDesc. 
We should be + // able to remove the extra checks once those are fixed up. For example, + // sometimes we might get something like %RAX<def> = POP64r 1. This won't be + // caught by modifiesRegister or readsRegister even though the instruction + // really ought to be formed so that modifiesRegister/readsRegister would + // catch it. + if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) || + MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) || + MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) + return MachineOutlinerInstrType::Illegal; + + // Outlined calls change the instruction pointer, so don't read from it. + if (MI.readsRegister(X86::RIP, &RI) || + MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) || + MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP)) + return MachineOutlinerInstrType::Illegal; + + // Positions can't safely be outlined. + if (MI.isPosition()) + return MachineOutlinerInstrType::Illegal; + + // Make sure none of the operands of this instruction do anything tricky. + for (const MachineOperand &MOP : MI.operands()) + if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || + MOP.isTargetIndex()) + return MachineOutlinerInstrType::Illegal; + + return MachineOutlinerInstrType::Legal; +} + +void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const { + + // If we're a tail call, we already have a return, so don't do anything. + if (IsTailCall) + return; + + // We're a normal call, so our sequence doesn't have a return instruction. + // Add it in. + MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ)); + MBB.insert(MBB.end(), retq); +} + +void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const { + return; +} + +MachineBasicBlock::iterator +X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &It, + MachineFunction &MF, + bool IsTailCall) const { + // Is it a tail call? + if (IsTailCall) { + // Yes, just insert a JMP. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(X86::JMP_1)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + } else { + // No, insert a call. + It = MBB.insert(It, + BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32)) + .addGlobalAddress(M.getNamedValue(MF.getName()))); + } + + return It; +} diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index acfdef4da7a3a..2fee48570ce17 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -182,6 +182,20 @@ public: /// const X86RegisterInfo &getRegisterInfo() const { return RI; } + /// Returns the stack pointer adjustment that happens inside the frame + /// setup..destroy sequence (e.g. by pushes, or inside the callee). + int64_t getFrameAdjustment(const MachineInstr &I) const { + assert(isFrameInstr(I)); + return I.getOperand(1).getImm(); + } + + /// Sets the stack pointer adjustment made inside the frame made up by this + /// instruction. + void setFrameAdjustment(MachineInstr &I, int64_t V) const { + assert(isFrameInstr(I)); + I.getOperand(1).setImm(V); + } + /// getSPAdjust - This returns the stack pointer adjustment made by /// this instruction. For x86, we need to handle more complex call /// sequences involving PUSHes. @@ -316,6 +330,13 @@ public: // Branch analysis. 
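The MachineOutliner hooks added to X86InstrInfo above use a purely size-based cost model: outlining keeps one copy of the sequence (plus a ret when it cannot be tail-called) and replaces every occurrence with a call. A self-contained sketch of that arithmetic (a free function for illustration, not the real override):

#include <cstddef>

unsigned outliningBenefit(size_t SequenceSize, size_t Occurrences,
                          bool CanBeTailCall) {
  // Instructions spent if we do not outline: every occurrence keeps a copy.
  size_t NotOutlined = SequenceSize * Occurrences;
  // Instructions spent if we do: one outlined body (plus a return when it is
  // not a tail call) and one call per occurrence.
  size_t Outlined =
      (CanBeTailCall ? SequenceSize : SequenceSize + 1) + Occurrences;
  // The benefit is the saving, clamped at zero.
  return NotOutlined > Outlined ? NotOutlined - Outlined : 0;
}

// Example: SequenceSize = 10, Occurrences = 3, not a tail call:
// 30 instructions kept inline vs. 11 outlined + 3 calls, a saving of 16.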
bool isUnpredicatedTerminator(const MachineInstr &MI) const override; + bool isUnconditionalTailCall(const MachineInstr &MI) const override; + bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond, + const MachineInstr &TailCall) const override; + void replaceBranchWithTailCall(MachineBasicBlock &MBB, + SmallVectorImpl<MachineOperand> &Cond, + const MachineInstr &TailCall) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -436,9 +457,6 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool shouldScheduleAdjacent(const MachineInstr &First, - const MachineInstr &Second) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; bool @@ -539,8 +557,28 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; - bool isTailCall(const MachineInstr &Inst) const override; + unsigned getOutliningBenefit(size_t SequenceSize, + size_t Occurrences, + bool CanBeTailCall) const override; + + bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; + + llvm::X86GenInstrInfo::MachineOutlinerInstrType + getOutliningType(MachineInstr &MI) const override; + + void insertOutlinerEpilogue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool IsTailCall) const override; + + void insertOutlinerPrologue(MachineBasicBlock &MBB, + MachineFunction &MF, + bool isTailCall) const override; + MachineBasicBlock::iterator + insertOutlinedCall(Module &M, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &It, + MachineFunction &MF, + bool IsTailCall) const override; protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 38036715a25a3..e31d2769047b5 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -318,6 +318,7 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in { def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; } def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; } def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; } + def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; } def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; } } @@ -374,9 +375,10 @@ def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>; def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>; def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>; def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>; -def vy128xmem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256XOperand>; +def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>; def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>; def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>; +def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>; def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>; // A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead @@ -831,7 +833,6 @@ def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; def HasXSAVES : 
Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; -def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasTBM : Predicate<"Subtarget->hasTBM()">; @@ -848,8 +849,6 @@ def HasVBMI : Predicate<"Subtarget->hasVBMI()">, def HasIFMA : Predicate<"Subtarget->hasIFMA()">, AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">; def HasRTM : Predicate<"Subtarget->hasRTM()">; -def HasHLE : Predicate<"Subtarget->hasHLE()">; -def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; @@ -857,9 +856,11 @@ def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; +def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; @@ -895,6 +896,7 @@ def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; +def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// @@ -931,6 +933,15 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; +// FIXME: Ideally we would just replace the above i*immSExt* matchers with +// relocImm-based matchers, but then FastISel would be unable to use them. +def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<8>(N); +}]>; +def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<32>(N); +}]>; + // If we have multiple users of an immediate, it's much smaller to reuse // the register, rather than encode the immediate in every instruction. // This has the risk of increasing register pressure from stretched live @@ -971,6 +982,13 @@ def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; +def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field. 
def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; @@ -1106,13 +1124,15 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG>, OpSize16; -def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], - IIC_POP_MEM>, OpSize16; def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, SchedRW +let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in { +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [], + IIC_POP_MEM>, OpSize16; def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; -} // mayLoad, SchedRW +} // mayStore, mayLoad, WriteRMW let mayStore = 1, SchedRW = [WriteStore] in { def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[], @@ -1194,9 +1214,10 @@ def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayLoad, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>; -} // mayLoad, SchedRW let mayStore = 1, SchedRW = [WriteStore] in { def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; @@ -1965,7 +1986,12 @@ def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, Requires<[In64BitMode]>; // Data16 instruction prefix -def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>, + Requires<[Not16BitMode]>; + +// Data instruction prefix +def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>, + Requires<[In16BitMode]>; // Repeat string operation instruction prefixes // These uses the DF flag in the EFLAGS register to inc or dec ECX @@ -2079,6 +2105,7 @@ def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>, Requires<[Not64BitMode]>; +let mayStore = 1 in def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>, Requires<[Not64BitMode]>; @@ -2448,8 +2475,19 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, //===----------------------------------------------------------------------===// // CLZERO Instruction // -let Uses = [EAX] in -def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB; +let SchedRW = [WriteSystem] in { + let Uses = [EAX] in + def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>, + TB, Requires<[HasCLZERO]>; + + let usesCustomInserter = 1 in { + def CLZERO : PseudoI<(outs), (ins i32mem:$src1), + [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; + } +} // SchedRW + +def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; +def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Pattern fragments to auto generate 
TBM instructions. @@ -2522,10 +2560,10 @@ let Predicates = [HasTBM] in { // Memory Instructions // +let Predicates = [HasCLFLUSHOPT] in def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; -def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD; //===----------------------------------------------------------------------===// @@ -2977,7 +3015,7 @@ def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32me def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. def : InstAlias<"movq\t{$src, $dst|$dst, $src}", diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 0bb1068239835..dc3800ce381b0 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -294,6 +294,7 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), [(set VR64:$dst, (load_mmx addr:$src))], IIC_MMX_MOVQ_RM>; } // SchedRW + let SchedRW = [WriteStore] in def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", @@ -378,7 +379,6 @@ defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, MMX_PHADDSUBW>; - // -- Subtraction defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, MMX_INTALU_ITINS>; @@ -479,13 +479,6 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, MMX_SHIFT_ITINS>; -def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSRLWrm VR64:$src1, addr:$src2)>; -def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSRLDrm VR64:$src1, addr:$src2)>; -def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSRLQrm VR64:$src1, addr:$src2)>; - defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_mmx_psll_w, int_x86_mmx_pslli_w, MMX_SHIFT_ITINS>; @@ -496,13 +489,6 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_mmx_psll_q, int_x86_mmx_pslli_q, MMX_SHIFT_ITINS>; -def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSLLWrm VR64:$src1, addr:$src2)>; -def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSLLDrm VR64:$src1, addr:$src2)>; -def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSLLQrm VR64:$src1, addr:$src2)>; - defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_mmx_psra_w, int_x86_mmx_psrai_w, MMX_SHIFT_ITINS>; @@ -510,11 +496,6 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_mmx_psra_d, int_x86_mmx_psrai_d, MMX_SHIFT_ITINS>; -def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSRAWrm VR64:$src1, addr:$src2)>; -def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)), - (MMX_PSRADrm VR64:$src1, addr:$src2)>; - // Comparison Instructions defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, MMX_INTALU_ITINS>; @@ -576,9 +557,6 @@ def MMX_PSHUFWmi : MMXIi8<0x70, 
MRMSrcMem, imm:$src2))], IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>; - - - // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", @@ -639,7 +617,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), [(set GR32orGR64:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>; - // Low word of XMM to MMX. def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; @@ -670,6 +647,16 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), (MMX_MOVFR642Qrr FR64:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))), + (MMX_CVTPS2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))), + (MMX_CVTTPS2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (MMX_CVTPD2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (MMX_CVTTPD2PIirr VR128:$src)>; } - - diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index 309f601d1fcee..104ba2a174db1 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { +let mayLoad = 1 in { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, Not64BitMode]>; @@ -21,16 +22,19 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, In64BitMode]>; } +} defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { +let mayLoad = 1 in { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; +} def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; @@ -45,16 +49,18 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; +let mayLoad = 1 in { def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; - +} def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; +let mayStore = 1 in { def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; @@ -65,6 +71,8 @@ def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), "bndstx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; +} +let 
mayLoad = 1 in def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), "bndldx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1812d01711d16..e1bf28cbf6125 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -259,8 +259,8 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, - SDPatternOperator Int, RegisterClass RC, - string asm, Operand memopr, + SDPatternOperator OpNode, RegisterClass RC, + ValueType VT, string asm, Operand memopr, ComplexPattern mem_cpat, Domain d, OpndItins itins, bit Is2Addr = 1> { let isCodeGenOnly = 1, hasSideEffects = 0 in { @@ -268,14 +268,14 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in { !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>, + [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, Sched<[itins.Sched]>; let mayLoad = 1 in def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>, + [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -446,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", - [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>; + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>; + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; } //===----------------------------------------------------------------------===// @@ -461,12 +461,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in { + isPseudo = 1, SchedRW = [WriteZero] in { def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -let Predicates = [NoVLX] in +let Predicates = [NoAVX512] in def : Pat<(v4i32 immAllZerosV), (V_SET0)>; @@ -475,7 +475,7 @@ def : Pat<(v4i32 immAllZerosV), (V_SET0)>; // at the rename stage without using any execution unit, so SET0PSY // and SET0PDY can be used for vector int instructions without penalty let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in { + isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllZerosV))]>; } @@ -491,7 +491,6 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, [(set VR256:$dst, (v8i32 immAllOnesV))]>; } - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Move FP Scalar Instructions // @@ -527,12 +526,12 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, // AVX defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, - VEX_4V, VEX_LIG; + VEX_4V, VEX_LIG, VEX_WIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>, - VEX, VEX_LIG, Sched<[WriteStore]>; + VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG; // SSE1 & 2 let Constraints = "$src1 = $dst" in { defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, @@ -552,7 +551,7 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>; + IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG; def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], @@ -644,10 +643,6 @@ let Predicates = [UseAVX] in { (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; // 256-bit variants def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), @@ -738,10 +733,6 @@ let Predicates = [UseSSE2] in { (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold because @@ -786,29 +777,29 @@ let canFoldAsLoad = 1, 
isReMaterializable = 1 in let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - PS, VEX; + PS, VEX, VEX_WIG; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD, VEX; + PD, VEX, VEX_WIG; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - PS, VEX; + PS, VEX, VEX_WIG; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS>, - PD, VEX; + PD, VEX, VEX_WIG; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, - PS, VEX, VEX_L; + PS, VEX, VEX_L, VEX_WIG; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD, VEX, VEX_L; + PD, VEX, VEX_L, VEX_WIG; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, - PS, VEX, VEX_L; + PS, VEX, VEX_L, VEX_WIG; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS>, - PD, VEX, VEX_L; + PD, VEX, VEX_L, VEX_WIG; } let Predicates = [UseSSE1] in { @@ -832,35 +823,35 @@ let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX; + IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG; def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [(alignedstore (v2f64 VR128:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX; + IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG; def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", [(store (v4f32 VR128:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX; + IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG; def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v2f64 VR128:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX; + IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG; def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX, VEX_L; + IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG; def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVA_P_MR>, VEX, VEX_L; + IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG; def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movups\t{$src, $dst|$dst, $src}", [(store (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX, VEX_L; + IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG; def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVU_P_MR>, VEX, VEX_L; + IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG; } // SchedRW // For disassembler @@ -869,35 +860,35 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG; def VMOVAPDrr_REV : 
VPDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG; def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; } // Aliases to help the assembler pick two byte VEX encodings by swapping the @@ -955,24 +946,10 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, IIC_SSE_MOVU_P_RR>; } -// Use vmovaps/vmovups for AVX integer load/store. let Predicates = [HasAVX, NoVLX] in { - // 128-bit load/store - def : Pat<(alignedloadv2i64 addr:$src), - (VMOVAPSrm addr:$src)>; - def : Pat<(loadv2i64 addr:$src), - (VMOVUPSrm addr:$src)>; - - def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), - (VMOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), - (VMOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v2i64 VR128:$src), addr:$dst), - (VMOVUPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v4i32 VR128:$src), addr:$dst), - (VMOVUPSmr addr:$dst, VR128:$src)>; - - // 256-bit load/store + // 256-bit load/store need to use floating point load/store in case we don't + // have AVX2. Execution domain fixing will convert to integer if AVX2 is + // available and changing the domain is beneficial. 
def : Pat<(alignedloadv4i64 addr:$src), (VMOVAPSYrm addr:$src)>; def : Pat<(loadv4i64 addr:$src), @@ -981,10 +958,18 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v4i64 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v8i32 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; // Special patterns for storing subvector extracts of lower 128-bits // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr @@ -994,18 +979,6 @@ let Predicates = [HasAVX, NoVLX] in { def : Pat<(alignedstore (v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), addr:$dst), (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; def : Pat<(store (v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), addr:$dst), @@ -1013,40 +986,6 @@ let Predicates = [HasAVX, NoVLX] in { def : Pat<(store (v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), addr:$dst), (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; -} - -let Predicates = [HasAVX, NoVLX] in { - // 128-bit load/store - def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), - (VMOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), - (VMOVAPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v8i16 VR128:$src), addr:$dst), - (VMOVUPSmr addr:$dst, VR128:$src)>; - def : Pat<(store (v16i8 VR128:$src), addr:$dst), - (VMOVUPSmr addr:$dst, VR128:$src)>; - - // 256-bit load/store - def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), - (VMOVAPSYmr addr:$dst, VR256:$src)>; - def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), - (VMOVAPSYmr addr:$dst, VR256:$src)>; - def : Pat<(store (v16i16 VR256:$src), addr:$dst), - 
(VMOVUPSYmr addr:$dst, VR256:$src)>; - def : Pat<(store (v32i8 VR256:$src), addr:$dst), - (VMOVUPSYmr addr:$dst, VR256:$src)>; } // Use movaps / movups for SSE integer load / store (one byte shorter). @@ -1107,7 +1046,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, let Predicates = [UseAVX] in defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", - itin>, VEX_4V; + itin>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, @@ -1126,12 +1065,12 @@ def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], - IIC_SSE_MOV_LH>, VEX; + IIC_SSE_MOV_LH>, VEX, VEX_WIG; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], - IIC_SSE_MOV_LH>, VEX; + IIC_SSE_MOV_LH>, VEX, VEX_WIG; }// UseAVX def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", @@ -1238,12 +1177,12 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), - (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), - (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG; } // UseAVX def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", @@ -1343,14 +1282,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in { [(set VR128:$dst, (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_LH>, - VEX_4V, Sched<[WriteFShuffle]>; + VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG; def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_LH>, - VEX_4V, Sched<[WriteFShuffle]>; + VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG; } let Constraints = "$src1 = $dst", AddedComplexity = 20 in { def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), @@ -1725,11 +1664,11 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - PS, VEX, Requires<[HasAVX, NoVLX]>; + PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, SSE_CVT_PS>, - PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>; + PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64, "cvtdq2ps\t{$src, $dst|$dst, $src}", @@ -1777,20 +1716,21 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", // Convert scalar double to scalar single let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), - (ins FR64:$src1, 
FR64:$src2), + (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2F]>; + Sched<[WriteCvtF2F]>, VEX_WIG; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), - (ins FR64:$src1, f64mem:$src2), + (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2FLd, ReadAfterLd]>; + Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; } -def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, +def : Pat<(f32 (fpround FR64:$src)), + (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), @@ -1810,15 +1750,15 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>, - Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG, + Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, sse_load_f64:$src2))], - IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>, - Sched<[WriteCvtF2FLd, ReadAfterLd]>; + IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG, + Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, @@ -1842,30 +1782,30 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, // SSE2 instructions with XS prefix let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), - (ins FR32:$src1, FR32:$src2), + (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2F]>; + Sched<[WriteCvtF2F]>, VEX_WIG; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), - (ins FR32:$src1, f32mem:$src2), + (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, - Sched<[WriteCvtF2FLd, ReadAfterLd]>; + Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; } def : Pat<(f64 (fpextend FR32:$src)), - (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>; + (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; + (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, Requires<[UseAVX, OptForSpeed]>; def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), @@ -1895,15 +1835,15 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], - IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>, - 
Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG, + Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))], - IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>, - Sched<[WriteCvtF2FLd, ReadAfterLd]>; + IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG, + Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -1999,22 +1939,22 @@ def : Pat<(v4f32 (X86Movss def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], - IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG; def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 VR256:$src))], - IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))], - IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG; def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))], @@ -2035,7 +1975,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, - VEX, Sched<[WriteCvtF2I]>; + VEX, Sched<[WriteCvtF2I]>, VEX_WIG; // XMM only def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", @@ -2044,7 +1984,7 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, - Sched<[WriteCvtF2ILd]>; + Sched<[WriteCvtF2ILd]>, VEX_WIG; def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>; @@ -2053,12 +1993,12 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, - VEX, VEX_L, Sched<[WriteCvtF2I]>; + VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, - VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG; def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", @@ -2083,23 +2023,23 @@ def VCVTTPS2DQrr 
: VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (fp_to_sint (v4f32 VR128:$src))))], - IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (fp_to_sint (loadv4f32 addr:$src))))], - IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>; + IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG; def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v8i32 (fp_to_sint (v8f32 VR256:$src))))], - IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v8i32 (fp_to_sint (loadv8f32 addr:$src))))], IIC_SSE_CVT_PS_RM>, VEX, VEX_L, - Sched<[WriteCvtF2ILd]>; + Sched<[WriteCvtF2ILd]>, VEX_WIG; } def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -2118,7 +2058,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvttp2si (v2f64 VR128:$src))))], - IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG; // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. @@ -2132,7 +2072,7 @@ def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>; + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG; def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>; @@ -2142,12 +2082,12 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (fp_to_sint (v4f64 VR256:$src))))], - IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (fp_to_sint (loadv4f64 addr:$src))))], - IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>; + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG; } def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; @@ -2193,19 +2133,19 @@ let Predicates = [HasAVX, NoVLX] in { def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))], - IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, 
$dst|$dst, $src}", [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))], - IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))], - IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG; } let Predicates = [UseSSE2] in { @@ -2225,30 +2165,30 @@ let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, - VEX, Sched<[WriteCvtI2FLd]>; + (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>, + VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, - VEX, Sched<[WriteCvtI2F]>; + VEX, Sched<[WriteCvtI2F]>, VEX_WIG; def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, - VEX, VEX_L, Sched<[WriteCvtI2FLd]>; + VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG; def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, - VEX, VEX_L, Sched<[WriteCvtI2F]>; + VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG; } let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))], + (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -2276,7 +2216,7 @@ let Predicates = [HasAVX, NoVLX] in def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))], - IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG; // XMM only def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", @@ -2285,7 +2225,7 @@ let Predicates = [HasAVX, NoVLX] in def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))], - IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG; def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>; @@ -2294,11 +2234,11 @@ let Predicates = [HasAVX, NoVLX] in { def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (fpround VR256:$src))], - IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>; + IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))], - 
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>; + IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG; } def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; @@ -2368,21 +2308,25 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, } } +let ExeDomain = SSEPackedSingle in defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG; +let ExeDomain = SSEPackedDouble in defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare - XD, VEX_4V, VEX_LIG; + XD, VEX_4V, VEX_LIG, VEX_WIG; let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, i8immZExt3>, XS; + let ExeDomain = SSEPackedDouble in defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", @@ -2398,6 +2342,7 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, VR128:$src, immLeaf:$cc))], itins.rr>, Sched<[itins.Sched]>; +let mayLoad = 1 in def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, @@ -2408,18 +2353,22 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, let isCodeGenOnly = 1 in { // Aliases to match intrinsics which expect XMM operand(s). 
+ let ExeDomain = SSEPackedSingle in defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", SSE_ALU_F32S, i8immZExt5, sse_load_f32>, XS, VEX_4V; + let ExeDomain = SSEPackedDouble in defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}", SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS; + let ExeDomain = SSEPackedDouble in defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", SSE_ALU_F64S, i8immZExt3, sse_load_f64>, @@ -2437,6 +2386,7 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], IIC_SSE_COMIS_RR>, Sched<[WriteFAdd]>; +let mayLoad = 1 in def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), @@ -2454,6 +2404,7 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], IIC_SSE_COMIS_RR>, Sched<[WriteFAdd]>; +let mayLoad = 1 in def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), @@ -2464,26 +2415,26 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS, VEX, VEX_LIG; + "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD, VEX, VEX_LIG; + "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG; let Pattern = []<dag> in { defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss">, PS, VEX, VEX_LIG; + "comiss">, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd">, PD, VEX, VEX_LIG; + "comisd">, PD, VEX, VEX_LIG, VEX_WIG; } let isCodeGenOnly = 1 in { defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS, VEX; + sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG; defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD, VEX; + sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG; defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS, VEX; + sse_load_f32, "comiss">, PS, VEX, VEX_WIG; defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD, VEX; + sse_load_f64, "comisd">, PD, VEX, VEX_WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss">, PS; @@ -2512,18 +2463,19 @@ let Defs = [EFLAGS] in { // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, - Operand CC, Intrinsic Int, string asm, + Operand CC, ValueType VT, string asm, string asm_alt, Domain d, ImmLeaf immLeaf, PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (Int 
RC:$src1, RC:$src2, immLeaf:$cc))], + [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))], itins.rr, d>, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))], + [(set RC:$dst, + (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; @@ -2540,67 +2492,33 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, } } -defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V; -defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, + SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V; -defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, + SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; -defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { - defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; - defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; } -let Predicates = [HasAVX] in { -def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), - (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)), - (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; -def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), - (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)), - (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; - -def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), - (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; -def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)), - (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; -def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), - (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; -def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 
addr:$src2), imm:$cc)), - (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; -} - -let Predicates = [UseSSE1] in { -def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; -def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)), - (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; -} - -let Predicates = [UseSSE2] in { -def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), - (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; -def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; -} - //===----------------------------------------------------------------------===// // SSE 1 & 2 - Shuffle Instructions //===----------------------------------------------------------------------===// @@ -2624,16 +2542,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, let Predicates = [HasAVX, NoVLX] in { defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f32, SSEPackedSingle>, PS, VEX_4V; + loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; + loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv2f64, SSEPackedDouble>, PD, VEX_4V; + loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; + loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2715,29 +2633,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V; + SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V; + SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V; + SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V; + SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_L; + SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_L; + SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, VR256, f256mem, 
"unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_L; + SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_L; + SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; }// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, @@ -2789,13 +2707,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, let Predicates = [HasAVX] in { defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", - SSEPackedSingle>, PS, VEX; + SSEPackedSingle>, PS, VEX, VEX_WIG; defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", - SSEPackedDouble>, PD, VEX; + SSEPackedDouble>, PD, VEX, VEX_WIG; defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", - SSEPackedSingle>, PS, VEX, VEX_L; + SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", - SSEPackedDouble>, PD, VEX, VEX_L; + SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; } defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", @@ -2839,7 +2757,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, OpndItins itins, bit IsCommutable = 0, Predicate prd> { let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, - VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; + VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, @@ -2848,7 +2766,7 @@ let Constraints = "$src1 = $dst" in let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, - IsCommutable, 0>, VEX_4V, VEX_L; + IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; } // These are ordered here for pattern ordering requirements with the fp versions @@ -2876,7 +2794,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), (bc_v4i64 (v8f32 VR256:$src2))))], [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L; + (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, @@ -2884,14 +2802,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, (bc_v4i64 (v4f64 VR256:$src2))))], [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), (loadv4i64 addr:$src2)))], 0>, - PD, VEX_4V, VEX_L; + PD, VEX_4V, VEX_L, VEX_WIG; defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), (bc_v2i64 (v4f32 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V; + (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, @@ -2899,7 +2817,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, (bc_v2i64 (v2f64 VR128:$src2))))], [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)))], 0>, - PD, VEX_4V; + PD, 
VEX_4V, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -3065,17 +2983,17 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, - SSEPackedSingle, itins.s, 0>, PS, VEX_4V; + SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, - SSEPackedDouble, itins.d, 0>, PD, VEX_4V; + SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, - SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L; + SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, - SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; + SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -3092,10 +3010,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>, - XS, VEX_4V, VEX_LIG; + XS, VEX_4V, VEX_LIG, VEX_WIG; defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>, - XD, VEX_4V, VEX_LIG; + XD, VEX_4V, VEX_LIG, VEX_WIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), @@ -3108,21 +3026,20 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, - SDPatternOperator IntSS, - SDPatternOperator IntSD, + SDPatternOperator OpNode, SizeItins itins> { - defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128, + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, - SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG; - defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128, + SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, - SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG; + SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; let Constraints = "$src1 = $dst" in { - defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128, + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, SSEPackedSingle, itins.s>, XS; - defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128, + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, SSEPackedDouble, itins.d>, XD; } @@ -3131,29 +3048,23 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, // Binary Arithmetic instructions defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>, basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>, - basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag, - SSE_ALU_ITINS_S>; + basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>; defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>, basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>, 
-                  basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
-                                             SSE_MUL_ITINS_S>;
+                  basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
-            basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
-                                       SSE_ALU_ITINS_S>;
+            basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
-            basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
-                                       SSE_DIV_ITINS_S>;
+            basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
-            basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
-                                       int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
+            basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
-            basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
-                                       int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+            basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
}

let isCodeGenOnly = 1 in {
@@ -3400,7 +3311,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
            Sched<[itins.Sched.Folded, ReadAfterLd]>,
            Requires<[target, OptForSize]>;

-  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+  let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[itins.Sched.Folded, ReadAfterLd]>;
@@ -3444,7 +3355,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  let isCodeGenOnly = 1 in {
+  let isCodeGenOnly = 1, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3465,7 +3376,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
-  // the partial register store, either in ExeDepFix or with smarter RA.
+  // the partial register store, either in ExecutionDepsFix or with smarter RA.
  let Predicates = [UseAVX] in {
   def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
@@ -3495,22 +3406,22 @@ let Predicates = prds in {
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
-                      itins.rr>, VEX, Sched<[itins.Sched]>;
+                      itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
-                      itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+                      itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
-                       itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+                       itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
-                       itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+                       itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}

def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3531,22 +3442,22 @@ let Predicates = [HasAVX] in {
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
-                      itins.rr>, VEX, Sched<[itins.Sched]>;
+                      itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
-                      itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+                      itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
-                       itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+                       itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
-                       itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+                       itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}

def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3567,7 +3478,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
  defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                                 f32mem,
                                 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                                SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+                                SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
}

multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3579,7 +3490,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                 f64mem,
                                 !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                                 OpNode, SSEPackedDouble, itins, "SD">,
-                                XD, VEX_4V, VEX_LIG;
+                                XD, VEX_4V, VEX_LIG, VEX_WIG;
}

// Square root.
@@ -3647,41 +3558,41 @@ def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), "movntps\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_WIG; def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_WIG; let ExeDomain = SSEPackedInt in def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), + (ins i128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX; + IIC_SSE_MOVNT>, VEX, VEX_WIG; def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntps\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v8f32 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX, VEX_L; + IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG; def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntpd\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX, VEX_L; + IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG; let ExeDomain = SSEPackedInt in def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), + (ins i256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], - IIC_SSE_MOVNT>, VEX, VEX_L; + IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG; } def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), @@ -3797,20 +3708,18 @@ def : Pat<(X86MFence), (MFENCE)>; //===----------------------------------------------------------------------===// def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>; + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG; def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG; -let Predicates = [UseSSE1] in { def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; -} + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -3821,16 +3730,16 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions let hasSideEffects = 0, SchedRW = [WriteMove] in { def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX; + VEX, VEX_WIG; def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX, VEX_L; + VEX, VEX_L, VEX_WIG; def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, 
$dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX; + VEX, VEX_WIG; def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX, VEX_L; + VEX, VEX_L, VEX_WIG; } // For Disassembler @@ -3839,54 +3748,58 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX; + VEX, VEX_WIG; def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX; + VEX, VEX_WIG; def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { +let Predicates = [HasAVX,NoVLX] in def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, - VEX; + "movdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (alignedloadv2i64 addr:$src))], + IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG; def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, - VEX, VEX_L; -let Predicates = [HasAVX] in { - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, - XS, VEX; - def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, - XS, VEX, VEX_L; -} + VEX, VEX_L, VEX_WIG; +let Predicates = [HasAVX,NoVLX] in +def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv2i64 addr:$src))], + IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG; +def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>, + XS, VEX, VEX_L, VEX_WIG; } let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +let Predicates = [HasAVX,NoVLX] in def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, - VEX; + "movdqa\t{$src, $dst|$dst, $src}", + [(alignedstore (v2i64 VR128:$src), addr:$dst)], + IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG; def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, - VEX, VEX_L; -let Predicates = [HasAVX] in { + VEX, VEX_L, VEX_WIG; +let Predicates = [HasAVX,NoVLX] in def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, - XS, VEX; + "vmovdqu\t{$src, $dst|$dst, $src}", + [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>, + XS, VEX, VEX_WIG; def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>, - XS, VEX, VEX_L; -} + XS, VEX, VEX_L, VEX_WIG; } let SchedRW = [WriteMove] in { @@ -3949,6 +3862,50 @@ def : InstAlias<"vmovdqu\t{$src, 
$dst|$dst, $src}", def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>; +let Predicates = [HasAVX, NoVLX] in { + // Additional patterns for other integer sizes. + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; + + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr + def : Pat<(alignedstore (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(alignedstore (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + + def : Pat<(store (v2i64 (extract_subvector + (v4i64 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v4i32 (extract_subvector + (v8i32 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v8i16 (extract_subvector + (v16i16 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; + def : Pat<(store (v16i8 (extract_subvector + (v32i8 VR256:$src), (iPTR 0))), addr:$dst), + (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Arithmetic Instructions //===---------------------------------------------------------------------===// @@ -4037,12 +3994,12 @@ defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, - loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V; + loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, VR256, loadv4i64, i256mem, SSE_PMADD, - 0>, VEX_4V, VEX_L; + 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, memopv2i64, i128mem, SSE_PMADD>; @@ -4050,11 +4007,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>, - VEX_4V; + VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; @@ -4062,11 +4019,11 @@ defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, let Predicates = [HasAVX, NoVLX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>, - VEX_4V; + VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX] in defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, VR256, loadv4i64, i256mem, - SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L; + SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P>; @@ -4113,11 +4070,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR128, DstVT128, SrcVT, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR256, DstVT256, SrcVT, - loadv2i64, 0>, VEX_4V, VEX_L; + loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, VR128, DstVT128, SrcVT, memopv2i64>; @@ -4138,10 +4095,10 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, SDNode OpNode> { let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, - VR128, v16i8, 0>, VEX_4V; + VR128, v16i8, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, - VR256, v32i8, 0>, VEX_4V, VEX_L; + VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>; } @@ -4202,7 +4159,7 @@ let Predicates = [HasAVX, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>; + IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, @@ -4210,7 +4167,7 @@ let Predicates = [HasAVX, prd] in { [(set VR128:$dst, (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, - Sched<[WriteShuffleLd]>; + Sched<[WriteShuffleLd]>, VEX_WIG; } let Predicates = [HasAVX2, prd] in { @@ -4220,7 +4177,7 @@ let Predicates = [HasAVX2, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>; + IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, @@ -4228,7 +4185,7 @@ let Predicates = [HasAVX2, prd] in { [(set VR256:$dst, (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, - Sched<[WriteShuffleLd]>; + Sched<[WriteShuffleLd]>, VEX_WIG; } let Predicates 
= [UseSSE2] in { @@ -4257,20 +4214,6 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, NoVLX_Or_NoBWI>, XD; -let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))), - (VPSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (VPSHUFDri VR128:$src1, imm:$imm)>; -} - -let Predicates = [UseSSE2] in { - def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))), - (PSHUFDmi addr:$src1, imm:$imm)>; - def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), - (PSHUFDri VR128:$src1, imm:$imm)>; -} - //===---------------------------------------------------------------------===// // Packed Integer Pack Instructions (SSE & AVX) //===---------------------------------------------------------------------===// @@ -4364,24 +4307,24 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>, VEX_4V, VEX_L; } @@ -4443,44 +4386,44 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, - loadv2i64, 0>, VEX_4V; + loadv2i64, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, 
X86Unpckh>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -4565,14 +4508,14 @@ def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))], - IIC_SSE_MOVMSK>, VEX; + IIC_SSE_MOVMSK>, VEX, VEX_WIG; let Predicates = [HasAVX2] in { def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, - VEX, VEX_L; + VEX, VEX_L, VEX_WIG; } def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), @@ -4593,13 +4536,13 @@ def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)], - IIC_SSE_MASKMOV>, VEX; + IIC_SSE_MASKMOV>, VEX, VEX_WIG; let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)], - IIC_SSE_MASKMOV>, VEX; + IIC_SSE_MASKMOV>, VEX, VEX_WIG; let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -4725,19 +4668,6 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt - -def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - -def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>; - -def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), - (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>; - //===---------------------------------------------------------------------===// // Move Packed Doubleword Int first element to Doubleword Int // @@ -4758,12 +4688,12 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), } //SchedRW let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), +def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), +def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, 
$dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt @@ -4837,6 +4767,8 @@ let Predicates = [UseAVX] in { // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. let AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), @@ -4866,6 +4798,8 @@ let Predicates = [UseSSE2] in { (MOV64toPQIrr GR64:$src)>; } let AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), @@ -4903,7 +4837,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, - VEX, Requires<[UseAVX]>; + VEX, Requires<[UseAVX]>, VEX_WIG; def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -4920,7 +4854,7 @@ def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], - IIC_SSE_MOVDQ>, VEX; + IIC_SSE_MOVDQ>, VEX, VEX_WIG; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (extractelt (v2i64 VR128:$src), @@ -4932,7 +4866,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteVecLogic] in { def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), - "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX; + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG; def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>; } @@ -4978,7 +4912,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))], IIC_SSE_MOVQ_RR>, - XS, VEX, Requires<[UseAVX]>; + XS, VEX, Requires<[UseAVX]>, VEX_WIG; let AddedComplexity = 15 in def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", @@ -5016,13 +4950,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v4f32, VR128, loadv4f32, f128mem>, VEX; + v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v4f32, VR128, loadv4f32, f128mem>, VEX; + v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG; defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; + v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG; defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L; + v8f32, VR256, 
loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG; } defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, memopv4f32, f128mem>; @@ -5090,8 +5024,8 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), } let Predicates = [HasAVX, NoVLX] in { - defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; - defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; @@ -5108,16 +5042,6 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDDUPYrr VR256:$src)>; } -let Predicates = [HasAVX] in { - def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - def : Pat<(X86Movddup (bc_v2f64 - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; -} - let Predicates = [HasAVX, NoVLX] in def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPrm addr:$src)>; @@ -5128,13 +5052,6 @@ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), - (MOVDDUPrm addr:$src)>; - def : Pat<(X86Movddup (bc_v2f64 - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (MOVDDUPrm addr:$src)>; } //===---------------------------------------------------------------------===// @@ -5145,11 +5062,11 @@ let SchedRW = [WriteLoad] in { let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG; def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, - VEX, VEX_L; + VEX, VEX_L, VEX_WIG; } def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "lddqu\t{$src, $dst|$dst, $src}", @@ -5183,15 +5100,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V; + f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG; defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V; + f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG; defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L; + f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { @@ -5278,23 +5195,23 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, let Predicates = [HasAVX] in { let 
ExeDomain = SSEPackedSingle in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, loadv4f32, 0>, VEX_4V; + X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, loadv4f32, 0>, VEX_4V; + X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L; + X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L; + X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, loadv2f64, 0>, VEX_4V; + X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG; defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, loadv2f64, 0>, VEX_4V; + X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG; defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L; + X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L; + X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; } } @@ -5352,84 +5269,24 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, Sched<[WriteVecALULd]>; } -// Helper fragments to match sext vXi1 to vXiY. -def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)), - VR128:$src))>; -def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>; -def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>; -def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)), - VR256:$src))>; -def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>; -def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>; - -let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX; - defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX; -} -let Predicates = [HasAVX, NoVLX] in { - defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX; -} - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(xor - (bc_v2i64 (v16i1sextv16i8)), - (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), - (VPABSBrr VR128:$src)>; - def : Pat<(xor - (bc_v2i64 (v8i1sextv8i16)), - (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), - (VPABSWrr VR128:$src)>; + defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG; + defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { - def : Pat<(xor - (bc_v2i64 (v4i1sextv4i32)), - (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), - (VPABSDrr VR128:$src)>; -} - -let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L; - defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L; -} -let Predicates = [HasAVX2, NoVLX] in { - defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L; + defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG; } - let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - def : Pat<(xor - (bc_v4i64 (v32i1sextv32i8)), - (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))), - (VPABSBYrr VR256:$src)>; - def : Pat<(xor - (bc_v4i64 (v16i1sextv16i16)), - (bc_v4i64 (add 
(v16i16 VR256:$src), (v16i1sextv16i16)))), - (VPABSWYrr VR256:$src)>; + defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG; + defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { - def : Pat<(xor - (bc_v4i64 (v8i1sextv8i32)), - (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))), - (VPABSDYrr VR256:$src)>; + defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG; } -defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>; -defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>; -defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>; - -let Predicates = [UseSSSE3] in { - def : Pat<(xor - (bc_v2i64 (v16i1sextv16i8)), - (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))), - (PABSBrr VR128:$src)>; - def : Pat<(xor - (bc_v2i64 (v8i1sextv8i16)), - (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))), - (PABSWrr VR128:$src)>; - def : Pat<(xor - (bc_v2i64 (v4i1sextv4i32)), - (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))), - (PABSDrr VR128:$src)>; -} +defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>; +defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>; +defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions @@ -5527,45 +5384,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, VR128, loadv2i64, i128mem, - SSE_PSHUFB, 0>, VEX_4V; + SSE_PSHUFB, 0>, VEX_4V, VEX_WIG; defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, v16i8, VR128, loadv2i64, i128mem, - SSE_PMADD, 0>, VEX_4V; + SSE_PMADD, 0>, VEX_4V, VEX_WIG; } defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, VR128, loadv2i64, i128mem, - SSE_PMULHRSW, 0>, VEX_4V; + SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, loadv2i64, i128mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, loadv2i64, i128mem, - SSE_PHADDSUBD, 0>, VEX_4V; + SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, loadv2i64, i128mem, - SSE_PHADDSUBW, 0>, VEX_4V; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, loadv2i64, i128mem, SSE_PHADDSUBD, 0>, VEX_4V; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, - SSE_PSIGN, loadv2i64, 0>, VEX_4V; + SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, - SSE_PSIGN, loadv2i64, 0>, VEX_4V; + SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, - SSE_PSIGN, loadv2i64, 0>, VEX_4V; + SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V; + SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, 
VEX_WIG; } } @@ -5573,42 +5430,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, VR256, loadv4i64, i256mem, - SSE_PSHUFB, 0>, VEX_4V, VEX_L; + SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, v32i8, VR256, loadv4i64, i256mem, - SSE_PMADD, 0>, VEX_4V, VEX_L; + SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG; } defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, VR256, loadv4i64, i256mem, - SSE_PMULHRSW, 0>, VEX_4V, VEX_L; + SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, VR256, loadv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, loadv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, VR256, loadv4i64, i256mem, - SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; + SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, loadv4i64, i256mem, SSE_PHADDSUBW, 0>, VEX_4V, VEX_L; defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, - WriteVecALU>, VEX_4V, VEX_L; + WriteVecALU>, VEX_4V, VEX_L, VEX_WIG; defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, - WriteVecALU>, VEX_4V, VEX_L; + WriteVecALU>, VEX_4V, VEX_L, VEX_WIG; defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, - WriteVecALU>, VEX_4V, VEX_L; + WriteVecALU>, VEX_4V, VEX_L, VEX_WIG; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw, - WriteVecALU>, VEX_4V, VEX_L; + WriteVecALU>, VEX_4V, VEX_L, VEX_WIG; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw, - WriteVecALU>, VEX_4V, VEX_L; + WriteVecALU>, VEX_4V, VEX_L, VEX_WIG; } } @@ -5686,9 +5543,9 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX] in - defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V; + defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in - defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L; + defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGNR : ssse3_palignr<"palignr">; @@ -5779,10 +5636,10 @@ multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; let Predicates = [HasAVX, prd] in defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, - VR128, VR128, AVXItins>, VEX; + VR128, VR128, AVXItins>, VEX, VEX_WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, - VR256, VR128, AVX2Itins>, VEX, VEX_L; + VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG; } multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, @@ -6010,12 +5867,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, } } -defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; -defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; +defm : SS41I_pmovx_patterns<"VPMOVSX", "s", 
sext_invec, extloadi32i16>; +defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>; let Predicates = [UseSSE41] in { - defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; - defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; + defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>; } //===----------------------------------------------------------------------===// @@ -6103,20 +5960,20 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set GR64:$dst, (extractelt (v2i64 VR128:$src1), imm:$src2))]>, - Sched<[WriteShuffle]>, REX_W; + Sched<[WriteShuffle]>; let SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(store (extractelt (v2i64 VR128:$src1), imm:$src2), - addr:$dst)]>, REX_W; + addr:$dst)]>; } let Predicates = [HasAVX, NoDQI] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; -defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory /// destination @@ -6140,7 +5997,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr, let ExeDomain = SSEPackedSingle in { let Predicates = [UseAVX] in - defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG; defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>; } @@ -6268,7 +6125,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, let ExeDomain = SSEPackedSingle in { let Predicates = [UseAVX] in - defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG; let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; } @@ -6461,14 +6318,14 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, loadv4f32, loadv2f64, int_x86_sse41_round_ps, - int_x86_sse41_round_pd>, VEX; + int_x86_sse41_round_pd>, VEX, VEX_WIG; defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, loadv8f32, loadv4f64, int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX, VEX_L; + int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG; defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, - int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } @@ -6606,20 +6463,20 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, - Sched<[WriteVecLogic]>, VEX; + Sched<[WriteVecLogic]>, VEX, VEX_WIG; def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, - Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX; + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG; def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, 
(X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, - Sched<[WriteVecLogic]>, VEX, VEX_L; + Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG; def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, - Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L; + Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG; } let Defs = [EFLAGS] in { @@ -6722,7 +6579,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", int_x86_sse41_phminposuw, loadv2i64, - WriteVecIMul>, VEX; + WriteVecIMul>, VEX, VEX_WIG; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", int_x86_sse41_phminposuw, memopv2i64, WriteVecIMul>; @@ -6778,65 +6635,65 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX, NoVLX] in { defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32, VR128, loadv2i64, i128mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32, VR256, loadv4i64, i256mem, - SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; + SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm 
VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -6864,18 +6721,18 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX, NoVLX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>, - VEX_4V; + VEX_4V, VEX_WIG; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V; + VEX_4V, VEX_WIG; } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, - VEX_4V, VEX_L; + VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -6945,52 +6802,52 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG; } let ExeDomain = SSEPackedSingle in { defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, VR256, loadv8f32, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, VR256, loadv4f64, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L; + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; } defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V; + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, - SSE_DPPS_ITINS>, VEX_4V; + SSE_DPPS_ITINS>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, VR128, loadv2f64, f128mem, 0, - SSE_DPPS_ITINS>, VEX_4V; + SSE_DPPS_ITINS>, VEX_4V, VEX_WIG; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, loadv8f32, i256mem, 0, - SSE_DPPS_ITINS>, VEX_4V, VEX_L; + SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG; } let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; + DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG; } defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -7020,6 +6877,19 @@ let 
Constraints = "$src1 = $dst" in { SSE_DPPD_ITINS>; } +// For insertion into the zero index (low half) of a 256-bit vector, it is +// more efficient to generate a blend with immediate instead of an insert*128. +let Predicates = [HasAVX] in { +def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)), + (VBLENDPDYrri VR256:$src1, + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0x3)>; +def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, @@ -7165,14 +7035,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, - "\t{$src2, $dst|$dst, $src2}"), + "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], itins.rr>, Sched<[itins.Sched]>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), !strconcat(OpcodeStr, - "\t{$src2, $dst|$dst, $src2}"), + "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (mem_frag addr:$src2)), XMM0))], @@ -7193,18 +7063,18 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, DEFAULT_ITINS_VARBLENDSCHED>; // Aliases with the implicit xmm0 argument -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (BLENDVPDrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (BLENDVPSrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (PBLENDVBrr0 VR128:$dst, VR128:$src2)>; -def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>; +def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; +def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; +def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; let Predicates = [UseSSE41] in { def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), @@ -7228,17 +7098,14 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteLoad] in { let Predicates = [HasAVX, NoVLX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, - VEX; + "vmovntdqa\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_WIG; let Predicates = [HasAVX2, NoVLX] in def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR256:$dst, 
(int_x86_avx2_movntdqa addr:$src))]>, - VEX, VEX_L; + "vmovntdqa\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_L, VEX_WIG; def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movntdqa\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; + "movntdqa\t{$src, $dst|$dst, $src}", []>; } // SchedRW let Predicates = [HasAVX2, NoVLX] in { @@ -7295,11 +7162,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, - loadv2i64, i128mem, 0>, VEX_4V; + loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, - loadv4i64, i256mem, 0>, VEX_4V, VEX_L; + loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, @@ -7323,7 +7190,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, - Requires<[HasAVX]>; + Requires<[HasAVX]>, VEX_WIG; defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, Requires<[UseSSE42]>; } @@ -7397,7 +7264,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, - Requires<[HasAVX]>; + Requires<[HasAVX]>, VEX_WIG; defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, Requires<[UseSSE42]>; } @@ -7515,14 +7382,18 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, bit UsesXMM0 = 0> { def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !if(UsesXMM0, + !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8; def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !if(UsesXMM0, + !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), @@ -7557,10 +7428,10 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { } // Aliases with explicit %xmm0 -def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (SHA256RNDS2rr VR128:$dst, VR128:$src2)>; -def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", - (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>; +def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", + (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", + (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; //===----------------------------------------------------------------------===// // AES-NI Instructions @@ -7588,13 +7459,13 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V; + int_x86_aesni_aesenc, loadv2i64, 0>, 
VEX_4V, VEX_WIG; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V; + int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V; + int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V; + int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -7615,12 +7486,12 @@ let Predicates = [HasAVX, HasAES] in { "vaesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, - VEX; + VEX, VEX_WIG; def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, - Sched<[WriteAESIMCLd]>, VEX; + Sched<[WriteAESIMCLd]>, VEX, VEX_WIG; } def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), @@ -7640,13 +7511,13 @@ let Predicates = [HasAVX, HasAES] in { "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, - Sched<[WriteAESKeyGen]>, VEX; + Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, - Sched<[WriteAESKeyGenLd]>, VEX; + Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -7672,14 +7543,14 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, - Sched<[WriteCLMul]>; + Sched<[WriteCLMul]>, VEX_WIG; def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (loadv2i64 addr:$src2), imm:$src3))]>, - Sched<[WriteCLMulLd, ReadAfterLd]>; + Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG; // Carry-less Multiplication instructions let Constraints = "$src1 = $dst" in { @@ -7879,6 +7750,15 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } + +// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit +// all ones value. 
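At the intrinsics level, the trick described in the comment above and encoded by the HasAVX1Only pattern that follows looks roughly like this. This is only an illustrative sketch (it assumes an AVX-capable compiler and <immintrin.h>), not the backend code itself:

#include <immintrin.h>

// AVX1 has no 256-bit integer compare, so materialize 128-bit all-ones with
// PCMPEQD (a value compared against itself) and place that value into both
// halves of a YMM register with VINSERTF128.
static __m256i AllOnes256AVX1() {
  __m128i Ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Ones), Ones, 1);
}
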
+let Predicates = [HasAVX1Only] in +def : Pat<(v8i32 immAllOnesV), + (VINSERTF128rr + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm), + (V_SETALLONES), 1)>; + multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, PatFrag memop_frag> { def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), @@ -8029,41 +7909,6 @@ let ExeDomain = SSEPackedDouble in { loadv4i64, v4f64, v4i64>, VEX_L; } -let Predicates = [HasAVX, NoVLX] in { -def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), - (VPERMILPSYrr VR256:$src1, VR256:$src2)>; -def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), - (VPERMILPSYrm VR256:$src1, addr:$src2)>; -def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), - (VPERMILPDYrr VR256:$src1, VR256:$src2)>; -def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), - (VPERMILPDYrm VR256:$src1, addr:$src2)>; - -def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), - (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), - (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), - (i8 imm:$imm))), - (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), - (VPERMILPDYmi addr:$src1, imm:$imm)>; - -def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), - (VPERMILPSrr VR128:$src1, VR128:$src2)>; -def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), - (VPERMILPSrm VR128:$src1, addr:$src2)>; -def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), - (VPERMILPDrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), - (VPERMILPDrm VR128:$src1, addr:$src2)>; - -def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), - (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), - (VPERMILPDmi addr:$src1, imm:$imm)>; -} - //===----------------------------------------------------------------------===// // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks // @@ -8118,15 +7963,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // +// Note, these instruction do not affect the YMM16-YMM31. let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>; + [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG; // Zero Upper bits of YMM registers def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>; + [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG; } //===----------------------------------------------------------------------===// @@ -8235,6 +8081,46 @@ defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, VR256, loadv4i64, i256mem>, VEX_L; +// For insertion into the zero index (low half) of a 256-bit vector, it is +// more efficient to generate a blend with immediate instead of an insert*128. 
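The comment that closes above, repeated from the AVX section earlier, is worth a concrete illustration: writing a 128-bit value into the low half of a 256-bit register can be expressed as a blend whose immediate selects the low four dwords, rather than as an insertf128/inserti128. A minimal intrinsics sketch of the two forms the following patterns trade between (assumes AVX2 and <immintrin.h>; illustrative only, function names are not from the source):

#include <immintrin.h>

// Blend form: immediate 0x0f takes dwords 0-3 from the widened 128-bit value
// and dwords 4-7 from the original 256-bit register (VPBLENDD).
static __m256i InsertLowViaBlend(__m256i Hi, __m128i Lo) {
  return _mm256_blend_epi32(Hi, _mm256_castsi128_si256(Lo), 0x0f);
}

// Insert form: the same result via VINSERTI128 into lane 0.
static __m256i InsertLowViaInsert(__m256i Hi, __m128i Lo) {
  return _mm256_inserti128_si256(Hi, Lo, 0);
}
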
+let Predicates = [HasAVX2] in { +def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + //===----------------------------------------------------------------------===// // VPBROADCAST - Load from memory and broadcast to all elements of the // destination operand @@ -8282,6 +8168,11 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + (VPBROADCASTQYrm addr:$src)>; // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. // This means we'll encounter truncated i32 loads; match that here. def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), @@ -8296,7 +8187,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { (VPBROADCASTWYrm addr:$src)>; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { // Provide aliases for broadcast from the same register class that // automatically does the extract. def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), @@ -8343,18 +8234,13 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>; + (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), - (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; - def : Pat<(v4i64 (X86VBroadcast GR64:$src)), - (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; - - // The patterns for VPBROADCASTD are not needed because they would match - // the exact same thing as VBROADCASTSS patterns. 
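The X86vzload broadcast patterns added above come with a short rationale: a 32-bit target has no 64-bit general-purpose registers to load an i64 into, but MOVQ performs a zero-extending 64-bit vector load that VPBROADCASTQ can then splat. A rough intrinsics-level equivalent (assumes AVX2 and <immintrin.h>; the helper name is only for illustration):

#include <immintrin.h>

// MOVQ loads 64 bits and zero-extends to 128; VPBROADCASTQ splats lane 0
// across the YMM register. Works the same on 32-bit and 64-bit targets.
static __m256i BroadcastU64(const long long *P) {
  __m128i Q = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(P));
  return _mm256_broadcastq_epi64(Q);
}
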
- + (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>; def : Pat<(v2i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; - // The v4i64 pattern is not needed because VBROADCASTSDYrr already match. + (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>; } // AVX1 broadcast patterns @@ -8377,15 +8263,15 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [HasAVX1Only] in { def : Pat<(v4f32 (X86VBroadcast FR32:$src)), - (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; + (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), - (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), - (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; + (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm), + (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), - (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm), - (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>; + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>; def : Pat<(v4i32 (X86VBroadcast GR32:$src)), (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>; @@ -8399,7 +8285,7 @@ let Predicates = [HasAVX1Only] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; def : Pat<(v2i64 (X86VBroadcast i64:$src)), - (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>; + (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>; } //===----------------------------------------------------------------------===// @@ -8407,7 +8293,8 @@ let Predicates = [HasAVX1Only] in { // multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT, X86FoldableSchedWrite Sched> { + ValueType OpVT, X86FoldableSchedWrite Sched, + X86MemOperand memOp> { let Predicates = [HasAVX2, NoVLX] in { def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), @@ -8417,7 +8304,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, Sched<[Sched]>, VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2), + (ins VR256:$src1, memOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -8427,12 +8314,15 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, } } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256, + i256mem>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256, + f256mem>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT, X86FoldableSchedWrite Sched> { + ValueType OpVT, X86FoldableSchedWrite Sched, + X86MemOperand memOp> { let Predicates = [HasAVX2, NoVLX] in { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -8442,7 +8332,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, Sched<[Sched]>, VEX, VEX_L; 
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), - (ins i256mem:$src1, u8imm:$src2), + (ins memOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, @@ -8453,10 +8343,10 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, } defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, - WriteShuffle256>, VEX_W; + WriteShuffle256, i256mem>, VEX_W; let ExeDomain = SSEPackedDouble in defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, - WriteFShuffle256>, VEX_W; + WriteFShuffle256, f256mem>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index e2be73532157e..0efb383e1c8d4 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -340,75 +340,71 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), let hasSideEffects = 0 in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { + +let Uses = [CL, EFLAGS] in { +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} // Uses = [CL, EFLAGS] + +let Uses = [EFLAGS] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; -let Uses = [CL] in -def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), - "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; -let Uses = [CL] in -def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), - "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; - def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; -let Uses = [CL] in -def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), - "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; - - def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "rcl{q}\t$dst", [], IIC_SR>; def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; -let Uses = [CL] in -def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), - "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} // Uses = [EFLAGS] +let Uses = [CL, EFLAGS] in { +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, 
$dst|$dst, cl}", [], IIC_SR>, OpSize32; +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} // Uses = [CL, EFLAGS] +let Uses = [EFLAGS] in { def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t$dst", [], IIC_SR>; def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; -let Uses = [CL] in -def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), - "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; -let Uses = [CL] in -def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), - "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16; - def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; -let Uses = [CL] in -def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), - "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; - def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; -let Uses = [CL] in -def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), - "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; +} // Uses = [EFLAGS] } // Constraints = "$src = $dst" -let SchedRW = [WriteShiftLd, WriteRMW] in { +let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in { +let Uses = [EFLAGS] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), @@ -442,8 +438,9 @@ def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t$dst", [], IIC_SR>; def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; +} // Uses = [EFLAGS] -let Uses = [CL] in { +let Uses = [CL, EFLAGS] in { def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), @@ -461,7 +458,7 @@ def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; -} +} // Uses = [CL, EFLAGS] } // SchedRW } // hasSideEffects = 0 @@ -665,19 +662,19 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), // Rotate by 1 def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), "ror{b}\t$dst", - [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)], + [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)], IIC_SR>; def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t$dst", - [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)], + [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)], IIC_SR>, OpSize16; def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t$dst", - [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)], + [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)], IIC_SR>, OpSize32; def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins 
i64mem:$dst), "ror{q}\t$dst", - [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)], + [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)], IIC_SR>; } // SchedRW @@ -849,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem, } // Defs = [EFLAGS] +// Sandy Bridge and newer Intel processors support faster rotates using +// SHLD to avoid a partial flag update on the normal rotate instructions. +let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in { + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), + (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>; + def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), + (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>; +} + def ROT32L2R_imm8 : SDNodeXForm<imm, [{ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 9265d64b3230f..2e5350ce979e3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -173,27 +173,28 @@ def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32; def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; - +let mayStore = 1 in { def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16; def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32; def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; - +} def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16; def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32; def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; - +let mayLoad = 1 in { def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16; def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32; def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; +} } // SchedRW //===----------------------------------------------------------------------===// @@ -202,6 +203,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), let SchedRW = [WriteSystem] in { def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; +let mayLoad = 1 in def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize16; @@ -210,6 +212,7 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), OpSize16; // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. 
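Two rotate rewrites appear above and both rest on simple identities: the ROR*m1 memory forms are now matched from a rotl by width-1 (rotr(x, 1) == rotl(x, w-1)), and on HasFastSHLDRotate targets an immediate rotl can be emitted as SHLD reg, reg, imm, since shld(x, x, s) == rotl(x, s) and SHLD fully rewrites EFLAGS instead of partially updating them. A small self-contained check of both identities, as a sketch in plain C++ (no target assumptions beyond 32-bit unsigned arithmetic):

#include <cassert>
#include <cstdint>

// Rotate-left helper; S is reduced mod 32 so the shift amounts stay defined.
static uint32_t Rotl32(uint32_t X, unsigned S) {
  S &= 31;
  return (X << S) | (X >> ((32 - S) & 31));
}

int main() {
  uint32_t X = 0x80000001u;
  // rotr(x, 1) == rotl(x, 31)
  assert(Rotl32(X, 31) == ((X >> 1) | (X << 31)));
  // shld(x, x, 5) == (x << 5) | (x >> 27) == rotl(x, 5)
  assert(Rotl32(X, 5) == ((X << 5) | (X >> 27)));
  return 0;
}
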
+let mayLoad = 1 in def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize32; @@ -217,23 +220,27 @@ def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize32; // i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. +let mayLoad = 1 in def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; +let mayLoad = 1 in def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize16; def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize16; +let mayLoad = 1 in def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize32; def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize32; +let mayLoad = 1 in def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), @@ -248,11 +255,13 @@ def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), "str{l}\t$dst", [], IIC_STR>, TB, OpSize32; def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), "str{q}\t$dst", [], IIC_STR>, TB; +let mayStore = 1 in def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst), "str{w}\t$dst", [], IIC_STR>, TB; def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; +let mayLoad = 1 in def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; @@ -377,12 +386,14 @@ def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), "verr\t$seg", [], IIC_VERR>, TB; -def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), - "verr\t$seg", [], IIC_VERR>, TB; def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", [], IIC_VERW_MEM>, TB; +let mayLoad = 1 in { +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), + "verr\t$seg", [], IIC_VERR>, TB; def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", [], IIC_VERW_REG>, TB; +} } // SchedRW //===----------------------------------------------------------------------===// @@ -403,6 +414,7 @@ def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst), "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>; def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16; +let mayStore = 1 in def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst), "sldt{w}\t$dst", [], IIC_SLDT>, TB; def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), @@ -412,6 +424,7 @@ def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), // extension. 
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), "sldt{q}\t$dst", [], IIC_SLDT>, TB; +let mayStore = 1 in def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst), "sldt{q}\t$dst", [], IIC_SLDT>, TB; @@ -429,6 +442,7 @@ def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src), "lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>; def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), "lldt{w}\t$src", [], IIC_LLDT_REG>, TB; +let mayLoad = 1 in def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB; } // SchedRW @@ -459,6 +473,7 @@ def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst), def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB; +let mayLoad = 1 in def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index 7267d752653e3..38ac8be944832 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -25,9 +25,9 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), let isBranch = 1, isTerminator = 1, Defs = [EAX] in { def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), - "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>; + "xbegin\t$dst", []>, OpSize16; def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), - "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>; + "xbegin\t$dst", []>, OpSize32; } def XEND : I<0x01, MRM_D5, (outs), (ins), @@ -35,7 +35,7 @@ def XEND : I<0x01, MRM_D5, (outs), (ins), let Defs = [EFLAGS] in def XTEST : I<0x01, MRM_D6, (outs), (ins), - "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>; + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>; def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", @@ -44,7 +44,7 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), // HLE prefixes let isAsmParserOnly = 1 in { -def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>; -def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>; +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>; +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>; } diff --git a/lib/Target/X86/X86InstrTablesInfo.h b/lib/Target/X86/X86InstrTablesInfo.h deleted file mode 100755 index 415a891bfd97b..0000000000000 --- a/lib/Target/X86/X86InstrTablesInfo.h +++ /dev/null @@ -1,1162 +0,0 @@ -//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains related X86 Instruction Information Tables. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H -#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H - -using namespace llvm; - -struct X86EvexToVexCompressTableEntry { - uint16_t EvexOpcode; - uint16_t VexOpcode; -}; - - - -// X86 EVEX encoded instructions that have a VEX 128 encoding -// (table format: <EVEX opcode, VEX-128 opcode>). -static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = { - // EVEX scalar with corresponding VEX. 
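The table whose entries follow (part of the X86InstrTablesInfo.h removal) consists of X86EvexToVexCompressTableEntry records: pairs mapping an EVEX-encoded opcode to its VEX-128 or VEX-256 equivalent. Keeping such entries sorted by the EVEX opcode lets a pass translate with a binary search. The sketch below shows only that shape and is a hypothetical stand-in (lookupVexOpcode is not LLVM API), reusing the struct declared in the deleted header:

#include <algorithm>
#include <cstddef>
#include <cstdint>

struct X86EvexToVexCompressTableEntry {
  uint16_t EvexOpcode;
  uint16_t VexOpcode;
};

// Binary-search a table sorted by EvexOpcode; returns 0 when the EVEX
// instruction has no VEX counterpart in the table.
static uint16_t lookupVexOpcode(const X86EvexToVexCompressTableEntry *Table,
                                size_t N, uint16_t EvexOpc) {
  const X86EvexToVexCompressTableEntry *I = std::lower_bound(
      Table, Table + N, EvexOpc,
      [](const X86EvexToVexCompressTableEntry &E, uint16_t Opc) {
        return E.EvexOpcode < Opc;
      });
  return (I != Table + N && I->EvexOpcode == EvexOpc) ? I->VexOpcode : 0;
}
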
- { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm }, - { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr }, - { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm }, - { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr }, - { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm }, - { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr }, - { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm }, - { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr }, - { X86::VADDSDZrm , X86::VADDSDrm }, - { X86::VADDSDZrm_Int , X86::VADDSDrm_Int }, - { X86::VADDSDZrr , X86::VADDSDrr }, - { X86::VADDSDZrr_Int , X86::VADDSDrr_Int }, - { X86::VADDSSZrm , X86::VADDSSrm }, - { X86::VADDSSZrm_Int , X86::VADDSSrm_Int }, - { X86::VADDSSZrr , X86::VADDSSrr }, - { X86::VADDSSZrr_Int , X86::VADDSSrr_Int }, - { X86::VCOMISDZrm , X86::VCOMISDrm }, - { X86::VCOMISDZrr , X86::VCOMISDrr }, - { X86::VCOMISSZrm , X86::VCOMISSrm }, - { X86::VCOMISSZrr , X86::VCOMISSrr }, - { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm }, - { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr }, - { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm }, - { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr }, - { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm }, - { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr }, - { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm }, - { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm }, - { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr }, - { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr }, - { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm }, - { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm }, - { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr }, - { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr }, - { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm }, - { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr }, - { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm }, - { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr }, - { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm }, - { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr }, - { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm }, - { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm }, - { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr }, - { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr }, - { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm }, - { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm }, - { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr }, - { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr }, - { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm }, - { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm }, - { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr }, - { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr }, - { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm }, - { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm }, - { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr }, - { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr }, - { X86::VDIVSDZrm , X86::VDIVSDrm }, - { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int }, - { X86::VDIVSDZrr , X86::VDIVSDrr }, - { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int }, - { X86::VDIVSSZrm , X86::VDIVSSrm }, - { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int }, - { X86::VDIVSSZrr , X86::VDIVSSrr }, - { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int }, - { X86::VFMADD132SDZm , X86::VFMADD132SDm }, - { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int }, - { X86::VFMADD132SDZr , X86::VFMADD132SDr }, - { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int }, - { X86::VFMADD132SSZm , X86::VFMADD132SSm }, - { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int }, - { X86::VFMADD132SSZr , X86::VFMADD132SSr }, - { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int }, - { X86::VFMADD213SDZm , X86::VFMADD213SDm }, - { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int }, - { X86::VFMADD213SDZr , X86::VFMADD213SDr }, - { 
X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int }, - { X86::VFMADD213SSZm , X86::VFMADD213SSm }, - { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int }, - { X86::VFMADD213SSZr , X86::VFMADD213SSr }, - { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int }, - { X86::VFMADD231SDZm , X86::VFMADD231SDm }, - { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int }, - { X86::VFMADD231SDZr , X86::VFMADD231SDr }, - { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int }, - { X86::VFMADD231SSZm , X86::VFMADD231SSm }, - { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int }, - { X86::VFMADD231SSZr , X86::VFMADD231SSr }, - { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int }, - { X86::VFMSUB132SDZm , X86::VFMSUB132SDm }, - { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int }, - { X86::VFMSUB132SDZr , X86::VFMSUB132SDr }, - { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int }, - { X86::VFMSUB132SSZm , X86::VFMSUB132SSm }, - { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int }, - { X86::VFMSUB132SSZr , X86::VFMSUB132SSr }, - { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int }, - { X86::VFMSUB213SDZm , X86::VFMSUB213SDm }, - { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int }, - { X86::VFMSUB213SDZr , X86::VFMSUB213SDr }, - { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int }, - { X86::VFMSUB213SSZm , X86::VFMSUB213SSm }, - { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int }, - { X86::VFMSUB213SSZr , X86::VFMSUB213SSr }, - { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int }, - { X86::VFMSUB231SDZm , X86::VFMSUB231SDm }, - { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int }, - { X86::VFMSUB231SDZr , X86::VFMSUB231SDr }, - { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int }, - { X86::VFMSUB231SSZm , X86::VFMSUB231SSm }, - { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int }, - { X86::VFMSUB231SSZr , X86::VFMSUB231SSr }, - { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int }, - { X86::VFNMADD132SDZm , X86::VFNMADD132SDm }, - { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int }, - { X86::VFNMADD132SDZr , X86::VFNMADD132SDr }, - { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int }, - { X86::VFNMADD132SSZm , X86::VFNMADD132SSm }, - { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int }, - { X86::VFNMADD132SSZr , X86::VFNMADD132SSr }, - { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int }, - { X86::VFNMADD213SDZm , X86::VFNMADD213SDm }, - { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int }, - { X86::VFNMADD213SDZr , X86::VFNMADD213SDr }, - { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int }, - { X86::VFNMADD213SSZm , X86::VFNMADD213SSm }, - { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int }, - { X86::VFNMADD213SSZr , X86::VFNMADD213SSr }, - { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int }, - { X86::VFNMADD231SDZm , X86::VFNMADD231SDm }, - { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int }, - { X86::VFNMADD231SDZr , X86::VFNMADD231SDr }, - { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int }, - { X86::VFNMADD231SSZm , X86::VFNMADD231SSm }, - { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int }, - { X86::VFNMADD231SSZr , X86::VFNMADD231SSr }, - { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int }, - { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm }, - { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int }, - { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr }, - { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int }, - { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm }, - { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int }, - { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr }, - { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int }, - { X86::VFNMSUB213SDZm , 
X86::VFNMSUB213SDm }, - { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int }, - { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr }, - { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int }, - { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm }, - { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int }, - { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr }, - { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int }, - { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm }, - { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int }, - { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr }, - { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int }, - { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm }, - { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int }, - { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr }, - { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int }, - { X86::VMAXCSDZrm , X86::VMAXCSDrm }, - { X86::VMAXCSDZrr , X86::VMAXCSDrr }, - { X86::VMAXCSSZrm , X86::VMAXCSSrm }, - { X86::VMAXCSSZrr , X86::VMAXCSSrr }, - { X86::VMAXSDZrm , X86::VMAXSDrm }, - { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int }, - { X86::VMAXSDZrr , X86::VMAXSDrr }, - { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int }, - { X86::VMAXSSZrm , X86::VMAXSSrm }, - { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int }, - { X86::VMAXSSZrr , X86::VMAXSSrr }, - { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int }, - { X86::VMINCSDZrm , X86::VMINCSDrm }, - { X86::VMINCSDZrr , X86::VMINCSDrr }, - { X86::VMINCSSZrm , X86::VMINCSSrm }, - { X86::VMINCSSZrr , X86::VMINCSSrr }, - { X86::VMINSDZrm , X86::VMINSDrm }, - { X86::VMINSDZrm_Int , X86::VMINSDrm_Int }, - { X86::VMINSDZrr , X86::VMINSDrr }, - { X86::VMINSDZrr_Int , X86::VMINSDrr_Int }, - { X86::VMINSSZrm , X86::VMINSSrm }, - { X86::VMINSSZrm_Int , X86::VMINSSrm_Int }, - { X86::VMINSSZrr , X86::VMINSSrr }, - { X86::VMINSSZrr_Int , X86::VMINSSrr_Int }, - { X86::VMOV64toSDZrr , X86::VMOV64toSDrr }, - { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm }, - { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr }, - { X86::VMOVSDZmr , X86::VMOVSDmr }, - { X86::VMOVSDZrm , X86::VMOVSDrm }, - { X86::VMOVSDZrr , X86::VMOVSDrr }, - { X86::VMOVSSZmr , X86::VMOVSSmr }, - { X86::VMOVSSZrm , X86::VMOVSSrm }, - { X86::VMOVSSZrr , X86::VMOVSSrr }, - { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV }, - { X86::VMULSDZrm , X86::VMULSDrm }, - { X86::VMULSDZrm_Int , X86::VMULSDrm_Int }, - { X86::VMULSDZrr , X86::VMULSDrr }, - { X86::VMULSDZrr_Int , X86::VMULSDrr_Int }, - { X86::VMULSSZrm , X86::VMULSSrm }, - { X86::VMULSSZrm_Int , X86::VMULSSrm_Int }, - { X86::VMULSSZrr , X86::VMULSSrr }, - { X86::VMULSSZrr_Int , X86::VMULSSrr_Int }, - { X86::VSQRTSDZm , X86::VSQRTSDm }, - { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int }, - { X86::VSQRTSDZr , X86::VSQRTSDr }, - { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int }, - { X86::VSQRTSSZm , X86::VSQRTSSm }, - { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int }, - { X86::VSQRTSSZr , X86::VSQRTSSr }, - { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int }, - { X86::VSUBSDZrm , X86::VSUBSDrm }, - { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int }, - { X86::VSUBSDZrr , X86::VSUBSDrr }, - { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int }, - { X86::VSUBSSZrm , X86::VSUBSSrm }, - { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int }, - { X86::VSUBSSZrr , X86::VSUBSSrr }, - { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int }, - { X86::VUCOMISDZrm , X86::VUCOMISDrm }, - { X86::VUCOMISDZrr , X86::VUCOMISDrr }, - { X86::VUCOMISSZrm , X86::VUCOMISSrm }, - { X86::VUCOMISSZrr , X86::VUCOMISSrr }, - - { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr }, - { X86::VMOV64toSDZrr , X86::VMOV64toSDrr }, - { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm }, - { X86::VMOVDI2PDIZrr , 
X86::VMOVDI2PDIrr }, - { X86::VMOVLHPSZrr , X86::VMOVLHPSrr }, - { X86::VMOVHLPSZrr , X86::VMOVHLPSrr }, - { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr }, - { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr }, - { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr }, - { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr }, - { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm }, - { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr }, - - { X86::VPEXTRBZmr , X86::VPEXTRBmr }, - { X86::VPEXTRBZrr , X86::VPEXTRBrr }, - { X86::VPEXTRDZmr , X86::VPEXTRDmr }, - { X86::VPEXTRDZrr , X86::VPEXTRDrr }, - { X86::VPEXTRQZmr , X86::VPEXTRQmr }, - { X86::VPEXTRQZrr , X86::VPEXTRQrr }, - { X86::VPEXTRWZmr , X86::VPEXTRWmr }, - { X86::VPEXTRWZrr , X86::VPEXTRWri }, - - { X86::VPINSRBZrm , X86::VPINSRBrm }, - { X86::VPINSRBZrr , X86::VPINSRBrr }, - { X86::VPINSRDZrm , X86::VPINSRDrm }, - { X86::VPINSRDZrr , X86::VPINSRDrr }, - { X86::VPINSRQZrm , X86::VPINSRQrm }, - { X86::VPINSRQZrr , X86::VPINSRQrr }, - { X86::VPINSRWZrm , X86::VPINSRWrmi }, - { X86::VPINSRWZrr , X86::VPINSRWrri }, - - // EVEX 128 with corresponding VEX. - { X86::VADDPDZ128rm , X86::VADDPDrm }, - { X86::VADDPDZ128rr , X86::VADDPDrr }, - { X86::VADDPSZ128rm , X86::VADDPSrm }, - { X86::VADDPSZ128rr , X86::VADDPSrr }, - { X86::VANDNPDZ128rm , X86::VANDNPDrm }, - { X86::VANDNPDZ128rr , X86::VANDNPDrr }, - { X86::VANDNPSZ128rm , X86::VANDNPSrm }, - { X86::VANDNPSZ128rr , X86::VANDNPSrr }, - { X86::VANDPDZ128rm , X86::VANDPDrm }, - { X86::VANDPDZ128rr , X86::VANDPDrr }, - { X86::VANDPSZ128rm , X86::VANDPSrm }, - { X86::VANDPSZ128rr , X86::VANDPSrr }, - { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm }, - { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr }, - { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr }, - { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm }, - { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr }, - { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm }, - { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr }, - { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm }, - { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr }, - { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm }, - { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr }, - { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm }, - { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr }, - { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm }, - { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr }, - { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm }, - { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr }, - { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr }, - { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr }, - { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm }, - { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr }, - { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm }, - { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr }, - { X86::VDIVPDZ128rm , X86::VDIVPDrm }, - { X86::VDIVPDZ128rr , X86::VDIVPDrr }, - { X86::VDIVPSZ128rm , X86::VDIVPSrm }, - { X86::VDIVPSZ128rr , X86::VDIVPSrr }, - { X86::VFMADD132PDZ128m , X86::VFMADD132PDm }, - { X86::VFMADD132PDZ128r , X86::VFMADD132PDr }, - { X86::VFMADD132PSZ128m , X86::VFMADD132PSm }, - { X86::VFMADD132PSZ128r , X86::VFMADD132PSr }, - { X86::VFMADD213PDZ128m , X86::VFMADD213PDm }, - { X86::VFMADD213PDZ128r , X86::VFMADD213PDr }, - { X86::VFMADD213PSZ128m , X86::VFMADD213PSm }, - { X86::VFMADD213PSZ128r , X86::VFMADD213PSr }, - { X86::VFMADD231PDZ128m , X86::VFMADD231PDm }, - { X86::VFMADD231PDZ128r , X86::VFMADD231PDr }, - { X86::VFMADD231PSZ128m , X86::VFMADD231PSm }, - { X86::VFMADD231PSZ128r , X86::VFMADD231PSr }, - { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm }, - { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr }, - { 
X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm }, - { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr }, - { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm }, - { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr }, - { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm }, - { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr }, - { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm }, - { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr }, - { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm }, - { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr }, - { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm }, - { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr }, - { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm }, - { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr }, - { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm }, - { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr }, - { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm }, - { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr }, - { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm }, - { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr }, - { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm }, - { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr }, - { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm }, - { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr }, - { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm }, - { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr }, - { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm }, - { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr }, - { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm }, - { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr }, - { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm }, - { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr }, - { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm }, - { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr }, - { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm }, - { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr }, - { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm }, - { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr }, - { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm }, - { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr }, - { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm }, - { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr }, - { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm }, - { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr }, - { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm }, - { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr }, - { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm }, - { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr }, - { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm }, - { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr }, - { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm }, - { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr }, - { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm }, - { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr }, - { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm }, - { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr }, - { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm }, - { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr }, - { X86::VMAXCPDZ128rm , X86::VMAXCPDrm }, - { X86::VMAXCPDZ128rr , X86::VMAXCPDrr }, - { X86::VMAXCPSZ128rm , X86::VMAXCPSrm }, - { X86::VMAXCPSZ128rr , X86::VMAXCPSrr }, - { X86::VMAXPDZ128rm , X86::VMAXPDrm }, - { X86::VMAXPDZ128rr , X86::VMAXPDrr }, - { X86::VMAXPSZ128rm , X86::VMAXPSrm }, - { X86::VMAXPSZ128rr , X86::VMAXPSrr }, - { X86::VMINCPDZ128rm , X86::VMINCPDrm }, - { X86::VMINCPDZ128rr , X86::VMINCPDrr }, - { X86::VMINCPSZ128rm , X86::VMINCPSrm }, - { X86::VMINCPSZ128rr , X86::VMINCPSrr }, - { X86::VMINPDZ128rm , 
X86::VMINPDrm }, - { X86::VMINPDZ128rr , X86::VMINPDrr }, - { X86::VMINPSZ128rm , X86::VMINPSrm }, - { X86::VMINPSZ128rr , X86::VMINPSrr }, - { X86::VMOVAPDZ128mr , X86::VMOVAPDmr }, - { X86::VMOVAPDZ128rm , X86::VMOVAPDrm }, - { X86::VMOVAPDZ128rr , X86::VMOVAPDrr }, - { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV }, - { X86::VMOVAPSZ128mr , X86::VMOVAPSmr }, - { X86::VMOVAPSZ128rm , X86::VMOVAPSrm }, - { X86::VMOVAPSZ128rr , X86::VMOVAPSrr }, - { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV }, - { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm }, - { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr }, - { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr }, - { X86::VMOVDQA32Z128rm , X86::VMOVDQArm }, - { X86::VMOVDQA32Z128rr , X86::VMOVDQArr }, - { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV }, - { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr }, - { X86::VMOVDQA64Z128rm , X86::VMOVDQArm }, - { X86::VMOVDQA64Z128rr , X86::VMOVDQArr }, - { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV }, - { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr }, - { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm }, - { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr }, - { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV }, - { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr }, - { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm }, - { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr }, - { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV }, - { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr }, - { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm }, - { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr }, - { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV }, - { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr }, - { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm }, - { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr }, - { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV }, - { X86::VMOVHPDZ128mr , X86::VMOVHPDmr }, - { X86::VMOVHPDZ128rm , X86::VMOVHPDrm }, - { X86::VMOVHPSZ128mr , X86::VMOVHPSmr }, - { X86::VMOVHPSZ128rm , X86::VMOVHPSrm }, - { X86::VMOVLPDZ128mr , X86::VMOVLPDmr }, - { X86::VMOVLPDZ128rm , X86::VMOVLPDrm }, - { X86::VMOVLPSZ128mr , X86::VMOVLPSmr }, - { X86::VMOVLPSZ128rm , X86::VMOVLPSrm }, - { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm }, - { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr }, - { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr }, - { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr }, - { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm }, - { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr }, - { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm }, - { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr }, - { X86::VMOVUPDZ128mr , X86::VMOVUPDmr }, - { X86::VMOVUPDZ128rm , X86::VMOVUPDrm }, - { X86::VMOVUPDZ128rr , X86::VMOVUPDrr }, - { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV }, - { X86::VMOVUPSZ128mr , X86::VMOVUPSmr }, - { X86::VMOVUPSZ128rm , X86::VMOVUPSrm }, - { X86::VMOVUPSZ128rr , X86::VMOVUPSrr }, - { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV }, - { X86::VMULPDZ128rm , X86::VMULPDrm }, - { X86::VMULPDZ128rr , X86::VMULPDrr }, - { X86::VMULPSZ128rm , X86::VMULPSrm }, - { X86::VMULPSZ128rr , X86::VMULPSrr }, - { X86::VORPDZ128rm , X86::VORPDrm }, - { X86::VORPDZ128rr , X86::VORPDrr }, - { X86::VORPSZ128rm , X86::VORPSrm }, - { X86::VORPSZ128rr , X86::VORPSrr }, - { X86::VPABSBZ128rm , X86::VPABSBrm }, - { X86::VPABSBZ128rr , X86::VPABSBrr }, - { X86::VPABSDZ128rm , X86::VPABSDrm }, - { X86::VPABSDZ128rr , X86::VPABSDrr }, - { X86::VPABSWZ128rm , X86::VPABSWrm }, - { X86::VPABSWZ128rr , X86::VPABSWrr }, - { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm }, - { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr }, - { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm }, - { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr }, - { 
X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm }, - { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr }, - { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm }, - { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr }, - { X86::VPADDBZ128rm , X86::VPADDBrm }, - { X86::VPADDBZ128rr , X86::VPADDBrr }, - { X86::VPADDDZ128rm , X86::VPADDDrm }, - { X86::VPADDDZ128rr , X86::VPADDDrr }, - { X86::VPADDQZ128rm , X86::VPADDQrm }, - { X86::VPADDQZ128rr , X86::VPADDQrr }, - { X86::VPADDSBZ128rm , X86::VPADDSBrm }, - { X86::VPADDSBZ128rr , X86::VPADDSBrr }, - { X86::VPADDSWZ128rm , X86::VPADDSWrm }, - { X86::VPADDSWZ128rr , X86::VPADDSWrr }, - { X86::VPADDUSBZ128rm , X86::VPADDUSBrm }, - { X86::VPADDUSBZ128rr , X86::VPADDUSBrr }, - { X86::VPADDUSWZ128rm , X86::VPADDUSWrm }, - { X86::VPADDUSWZ128rr , X86::VPADDUSWrr }, - { X86::VPADDWZ128rm , X86::VPADDWrm }, - { X86::VPADDWZ128rr , X86::VPADDWrr }, - { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi }, - { X86::VPALIGNRZ128rri , X86::VPALIGNRrri }, - { X86::VPANDDZ128rm , X86::VPANDrm }, - { X86::VPANDDZ128rr , X86::VPANDrr }, - { X86::VPANDQZ128rm , X86::VPANDrm }, - { X86::VPANDQZ128rr , X86::VPANDrr }, - { X86::VPAVGBZ128rm , X86::VPAVGBrm }, - { X86::VPAVGBZ128rr , X86::VPAVGBrr }, - { X86::VPAVGWZ128rm , X86::VPAVGWrm }, - { X86::VPAVGWZ128rr , X86::VPAVGWrr }, - { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm }, - { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr }, - { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm }, - { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr }, - { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm }, - { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr }, - { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm }, - { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr }, - { X86::VPERMILPDZ128mi , X86::VPERMILPDmi }, - { X86::VPERMILPDZ128ri , X86::VPERMILPDri }, - { X86::VPERMILPDZ128rm , X86::VPERMILPDrm }, - { X86::VPERMILPDZ128rr , X86::VPERMILPDrr }, - { X86::VPERMILPSZ128mi , X86::VPERMILPSmi }, - { X86::VPERMILPSZ128ri , X86::VPERMILPSri }, - { X86::VPERMILPSZ128rm , X86::VPERMILPSrm }, - { X86::VPERMILPSZ128rr , X86::VPERMILPSrr }, - { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm }, - { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr }, - { X86::VPMADDWDZ128rm , X86::VPMADDWDrm }, - { X86::VPMADDWDZ128rr , X86::VPMADDWDrr }, - { X86::VPMAXSBZ128rm , X86::VPMAXSBrm }, - { X86::VPMAXSBZ128rr , X86::VPMAXSBrr }, - { X86::VPMAXSDZ128rm , X86::VPMAXSDrm }, - { X86::VPMAXSDZ128rr , X86::VPMAXSDrr }, - { X86::VPMAXSWZ128rm , X86::VPMAXSWrm }, - { X86::VPMAXSWZ128rr , X86::VPMAXSWrr }, - { X86::VPMAXUBZ128rm , X86::VPMAXUBrm }, - { X86::VPMAXUBZ128rr , X86::VPMAXUBrr }, - { X86::VPMAXUDZ128rm , X86::VPMAXUDrm }, - { X86::VPMAXUDZ128rr , X86::VPMAXUDrr }, - { X86::VPMAXUWZ128rm , X86::VPMAXUWrm }, - { X86::VPMAXUWZ128rr , X86::VPMAXUWrr }, - { X86::VPMINSBZ128rm , X86::VPMINSBrm }, - { X86::VPMINSBZ128rr , X86::VPMINSBrr }, - { X86::VPMINSDZ128rm , X86::VPMINSDrm }, - { X86::VPMINSDZ128rr , X86::VPMINSDrr }, - { X86::VPMINSWZ128rm , X86::VPMINSWrm }, - { X86::VPMINSWZ128rr , X86::VPMINSWrr }, - { X86::VPMINUBZ128rm , X86::VPMINUBrm }, - { X86::VPMINUBZ128rr , X86::VPMINUBrr }, - { X86::VPMINUDZ128rm , X86::VPMINUDrm }, - { X86::VPMINUDZ128rr , X86::VPMINUDrr }, - { X86::VPMINUWZ128rm , X86::VPMINUWrm }, - { X86::VPMINUWZ128rr , X86::VPMINUWrr }, - { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm }, - { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr }, - { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm }, - { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr }, - { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm }, - { X86::VPMOVSXBWZ128rr 
, X86::VPMOVSXBWrr }, - { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm }, - { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr }, - { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm }, - { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr }, - { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm }, - { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr }, - { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm }, - { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr }, - { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm }, - { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr }, - { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm }, - { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr }, - { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm }, - { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr }, - { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm }, - { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr }, - { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm }, - { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr }, - { X86::VPMULDQZ128rm , X86::VPMULDQrm }, - { X86::VPMULDQZ128rr , X86::VPMULDQrr }, - { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm }, - { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr }, - { X86::VPMULHUWZ128rm , X86::VPMULHUWrm }, - { X86::VPMULHUWZ128rr , X86::VPMULHUWrr }, - { X86::VPMULHWZ128rm , X86::VPMULHWrm }, - { X86::VPMULHWZ128rr , X86::VPMULHWrr }, - { X86::VPMULLDZ128rm , X86::VPMULLDrm }, - { X86::VPMULLDZ128rr , X86::VPMULLDrr }, - { X86::VPMULLWZ128rm , X86::VPMULLWrm }, - { X86::VPMULLWZ128rr , X86::VPMULLWrr }, - { X86::VPMULUDQZ128rm , X86::VPMULUDQrm }, - { X86::VPMULUDQZ128rr , X86::VPMULUDQrr }, - { X86::VPORDZ128rm , X86::VPORrm }, - { X86::VPORDZ128rr , X86::VPORrr }, - { X86::VPORQZ128rm , X86::VPORrm }, - { X86::VPORQZ128rr , X86::VPORrr }, - { X86::VPSADBWZ128rm , X86::VPSADBWrm }, - { X86::VPSADBWZ128rr , X86::VPSADBWrr }, - { X86::VPSHUFBZ128rm , X86::VPSHUFBrm }, - { X86::VPSHUFBZ128rr , X86::VPSHUFBrr }, - { X86::VPSHUFDZ128mi , X86::VPSHUFDmi }, - { X86::VPSHUFDZ128ri , X86::VPSHUFDri }, - { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi }, - { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri }, - { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi }, - { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri }, - { X86::VPSLLDQZ128rr , X86::VPSLLDQri }, - { X86::VPSLLDZ128ri , X86::VPSLLDri }, - { X86::VPSLLDZ128rm , X86::VPSLLDrm }, - { X86::VPSLLDZ128rr , X86::VPSLLDrr }, - { X86::VPSLLQZ128ri , X86::VPSLLQri }, - { X86::VPSLLQZ128rm , X86::VPSLLQrm }, - { X86::VPSLLQZ128rr , X86::VPSLLQrr }, - { X86::VPSLLVDZ128rm , X86::VPSLLVDrm }, - { X86::VPSLLVDZ128rr , X86::VPSLLVDrr }, - { X86::VPSLLVQZ128rm , X86::VPSLLVQrm }, - { X86::VPSLLVQZ128rr , X86::VPSLLVQrr }, - { X86::VPSLLWZ128ri , X86::VPSLLWri }, - { X86::VPSLLWZ128rm , X86::VPSLLWrm }, - { X86::VPSLLWZ128rr , X86::VPSLLWrr }, - { X86::VPSRADZ128ri , X86::VPSRADri }, - { X86::VPSRADZ128rm , X86::VPSRADrm }, - { X86::VPSRADZ128rr , X86::VPSRADrr }, - { X86::VPSRAVDZ128rm , X86::VPSRAVDrm }, - { X86::VPSRAVDZ128rr , X86::VPSRAVDrr }, - { X86::VPSRAWZ128ri , X86::VPSRAWri }, - { X86::VPSRAWZ128rm , X86::VPSRAWrm }, - { X86::VPSRAWZ128rr , X86::VPSRAWrr }, - { X86::VPSRLDQZ128rr , X86::VPSRLDQri }, - { X86::VPSRLDZ128ri , X86::VPSRLDri }, - { X86::VPSRLDZ128rm , X86::VPSRLDrm }, - { X86::VPSRLDZ128rr , X86::VPSRLDrr }, - { X86::VPSRLQZ128ri , X86::VPSRLQri }, - { X86::VPSRLQZ128rm , X86::VPSRLQrm }, - { X86::VPSRLQZ128rr , X86::VPSRLQrr }, - { X86::VPSRLVDZ128rm , X86::VPSRLVDrm }, - { X86::VPSRLVDZ128rr , X86::VPSRLVDrr }, - { X86::VPSRLVQZ128rm , X86::VPSRLVQrm }, - { X86::VPSRLVQZ128rr , X86::VPSRLVQrr }, - { X86::VPSRLWZ128ri , X86::VPSRLWri }, - { X86::VPSRLWZ128rm , X86::VPSRLWrm }, - { 
X86::VPSRLWZ128rr , X86::VPSRLWrr }, - { X86::VPSUBBZ128rm , X86::VPSUBBrm }, - { X86::VPSUBBZ128rr , X86::VPSUBBrr }, - { X86::VPSUBDZ128rm , X86::VPSUBDrm }, - { X86::VPSUBDZ128rr , X86::VPSUBDrr }, - { X86::VPSUBQZ128rm , X86::VPSUBQrm }, - { X86::VPSUBQZ128rr , X86::VPSUBQrr }, - { X86::VPSUBSBZ128rm , X86::VPSUBSBrm }, - { X86::VPSUBSBZ128rr , X86::VPSUBSBrr }, - { X86::VPSUBSWZ128rm , X86::VPSUBSWrm }, - { X86::VPSUBSWZ128rr , X86::VPSUBSWrr }, - { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm }, - { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr }, - { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm }, - { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr }, - { X86::VPSUBWZ128rm , X86::VPSUBWrm }, - { X86::VPSUBWZ128rr , X86::VPSUBWrr }, - { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm }, - { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr }, - { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm }, - { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr }, - { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm }, - { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr }, - { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm }, - { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr }, - { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm }, - { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr }, - { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm }, - { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr }, - { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm }, - { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr }, - { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm }, - { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr }, - { X86::VPXORDZ128rm , X86::VPXORrm }, - { X86::VPXORDZ128rr , X86::VPXORrr }, - { X86::VPXORQZ128rm , X86::VPXORrm }, - { X86::VPXORQZ128rr , X86::VPXORrr }, - { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi }, - { X86::VSHUFPDZ128rri , X86::VSHUFPDrri }, - { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi }, - { X86::VSHUFPSZ128rri , X86::VSHUFPSrri }, - { X86::VSQRTPDZ128m , X86::VSQRTPDm }, - { X86::VSQRTPDZ128r , X86::VSQRTPDr }, - { X86::VSQRTPSZ128m , X86::VSQRTPSm }, - { X86::VSQRTPSZ128r , X86::VSQRTPSr }, - { X86::VSUBPDZ128rm , X86::VSUBPDrm }, - { X86::VSUBPDZ128rr , X86::VSUBPDrr }, - { X86::VSUBPSZ128rm , X86::VSUBPSrm }, - { X86::VSUBPSZ128rr , X86::VSUBPSrr }, - { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm }, - { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr }, - { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm }, - { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr }, - { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm }, - { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr }, - { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm }, - { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr }, - { X86::VXORPDZ128rm , X86::VXORPDrm }, - { X86::VXORPDZ128rr , X86::VXORPDrr }, - { X86::VXORPSZ128rm , X86::VXORPSrm }, - { X86::VXORPSZ128rr , X86::VXORPSrr }, -}; - - -// X86 EVEX encoded instructions that have a VEX 256 encoding -// (table format: <EVEX opcode, VEX-256 opcode>). 
- static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = { - { X86::VADDPDZ256rm , X86::VADDPDYrm }, - { X86::VADDPDZ256rr , X86::VADDPDYrr }, - { X86::VADDPSZ256rm , X86::VADDPSYrm }, - { X86::VADDPSZ256rr , X86::VADDPSYrr }, - { X86::VANDNPDZ256rm , X86::VANDNPDYrm }, - { X86::VANDNPDZ256rr , X86::VANDNPDYrr }, - { X86::VANDNPSZ256rm , X86::VANDNPSYrm }, - { X86::VANDNPSZ256rr , X86::VANDNPSYrr }, - { X86::VANDPDZ256rm , X86::VANDPDYrm }, - { X86::VANDPDZ256rr , X86::VANDPDYrr }, - { X86::VANDPSZ256rm , X86::VANDPSYrm }, - { X86::VANDPSZ256rr , X86::VANDPSYrr }, - { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm }, - { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr }, - { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr }, - { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm }, - { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr }, - { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr }, - { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm }, - { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr }, - { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm }, - { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr }, - { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm }, - { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr }, - { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm }, - { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr }, - { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm }, - { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr }, - { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm }, - { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr }, - { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm }, - { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr }, - { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr }, - { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr }, - { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm }, - { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr }, - { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm }, - { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr }, - { X86::VDIVPDZ256rm , X86::VDIVPDYrm }, - { X86::VDIVPDZ256rr , X86::VDIVPDYrr }, - { X86::VDIVPSZ256rm , X86::VDIVPSYrm }, - { X86::VDIVPSZ256rr , X86::VDIVPSYrr }, - { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr }, - { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr }, - { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr }, - { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr }, - { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr }, - { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr }, - { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr }, - { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr }, - { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm }, - { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr }, - { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm }, - { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr }, - { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm }, - { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr }, - { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm }, - { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr }, - { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm }, - { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr }, - { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm }, - { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr }, - { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm }, - { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr }, - { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm }, - { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr }, - { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm }, - { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr }, - { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm }, - { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr }, - { 
X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm }, - { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr }, - { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm }, - { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr }, - { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm }, - { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr }, - { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm }, - { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr }, - { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm }, - { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr }, - { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm }, - { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr }, - { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm }, - { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr }, - { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm }, - { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr }, - { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm }, - { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr }, - { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm }, - { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr }, - { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm }, - { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr }, - { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm }, - { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr }, - { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm }, - { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr }, - { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm }, - { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr }, - { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm }, - { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr }, - { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm }, - { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr }, - { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm }, - { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr }, - { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm }, - { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr }, - { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm }, - { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr }, - { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm }, - { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr }, - { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm }, - { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr }, - { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm }, - { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr }, - { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm }, - { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr }, - { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm }, - { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr }, - { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm }, - { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr }, - { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm }, - { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr }, - { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm }, - { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm }, - { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr }, - { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr }, - { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm }, - { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm }, - { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr }, - { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr }, - { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm }, - { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr }, - { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm }, - { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr }, - { X86::VMAXPDZ256rm , X86::VMAXPDYrm }, - { X86::VMAXPDZ256rr , X86::VMAXPDYrr }, - { X86::VMAXPSZ256rm , X86::VMAXPSYrm }, - { X86::VMAXPSZ256rr , X86::VMAXPSYrr }, - { X86::VMINCPDZ256rm , X86::VMINCPDYrm }, - { 
X86::VMINCPDZ256rr , X86::VMINCPDYrr }, - { X86::VMINCPSZ256rm , X86::VMINCPSYrm }, - { X86::VMINCPSZ256rr , X86::VMINCPSYrr }, - { X86::VMINPDZ256rm , X86::VMINPDYrm }, - { X86::VMINPDZ256rr , X86::VMINPDYrr }, - { X86::VMINPSZ256rm , X86::VMINPSYrm }, - { X86::VMINPSZ256rr , X86::VMINPSYrr }, - { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr }, - { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm }, - { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr }, - { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV }, - { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr }, - { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm }, - { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr }, - { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV }, - { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm }, - { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr }, - { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr }, - { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm }, - { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr }, - { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV }, - { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr }, - { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm }, - { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr }, - { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV }, - { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr }, - { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm }, - { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr }, - { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV }, - { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr }, - { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm }, - { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr }, - { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV }, - { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr }, - { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm }, - { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr }, - { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV }, - { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr }, - { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm }, - { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr }, - { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV }, - { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm }, - { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr }, - { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr }, - { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr }, - { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm }, - { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr }, - { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm }, - { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr }, - { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr }, - { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm }, - { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr }, - { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV }, - { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr }, - { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm }, - { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr }, - { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV }, - { X86::VMULPDZ256rm , X86::VMULPDYrm }, - { X86::VMULPDZ256rr , X86::VMULPDYrr }, - { X86::VMULPSZ256rm , X86::VMULPSYrm }, - { X86::VMULPSZ256rr , X86::VMULPSYrr }, - { X86::VORPDZ256rm , X86::VORPDYrm }, - { X86::VORPDZ256rr , X86::VORPDYrr }, - { X86::VORPSZ256rm , X86::VORPSYrm }, - { X86::VORPSZ256rr , X86::VORPSYrr }, - { X86::VPABSBZ256rm , X86::VPABSBYrm }, - { X86::VPABSBZ256rr , X86::VPABSBYrr }, - { X86::VPABSDZ256rm , X86::VPABSDYrm }, - { X86::VPABSDZ256rr , X86::VPABSDYrr }, - { X86::VPABSWZ256rm , X86::VPABSWYrm }, - { X86::VPABSWZ256rr , X86::VPABSWYrr }, - { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm }, - { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr }, - { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm }, - { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr }, - { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm }, - { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr }, - { X86::VPACKUSWBZ256rm , 
X86::VPACKUSWBYrm }, - { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr }, - { X86::VPADDBZ256rm , X86::VPADDBYrm }, - { X86::VPADDBZ256rr , X86::VPADDBYrr }, - { X86::VPADDDZ256rm , X86::VPADDDYrm }, - { X86::VPADDDZ256rr , X86::VPADDDYrr }, - { X86::VPADDQZ256rm , X86::VPADDQYrm }, - { X86::VPADDQZ256rr , X86::VPADDQYrr }, - { X86::VPADDSBZ256rm , X86::VPADDSBYrm }, - { X86::VPADDSBZ256rr , X86::VPADDSBYrr }, - { X86::VPADDSWZ256rm , X86::VPADDSWYrm }, - { X86::VPADDSWZ256rr , X86::VPADDSWYrr }, - { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm }, - { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr }, - { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm }, - { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr }, - { X86::VPADDWZ256rm , X86::VPADDWYrm }, - { X86::VPADDWZ256rr , X86::VPADDWYrr }, - { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi }, - { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri }, - { X86::VPANDDZ256rm , X86::VPANDYrm }, - { X86::VPANDDZ256rr , X86::VPANDYrr }, - { X86::VPANDQZ256rm , X86::VPANDYrm }, - { X86::VPANDQZ256rr , X86::VPANDYrr }, - { X86::VPAVGBZ256rm , X86::VPAVGBYrm }, - { X86::VPAVGBZ256rr , X86::VPAVGBYrr }, - { X86::VPAVGWZ256rm , X86::VPAVGWYrm }, - { X86::VPAVGWZ256rr , X86::VPAVGWYrr }, - { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm }, - { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr }, - { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm }, - { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr }, - { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm }, - { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr }, - { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm }, - { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr }, - { X86::VPERMDZ256rm , X86::VPERMDYrm }, - { X86::VPERMDZ256rr , X86::VPERMDYrr }, - { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi }, - { X86::VPERMILPDZ256ri , X86::VPERMILPDYri }, - { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm }, - { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr }, - { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi }, - { X86::VPERMILPSZ256ri , X86::VPERMILPSYri }, - { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm }, - { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr }, - { X86::VPERMPDZ256mi , X86::VPERMPDYmi }, - { X86::VPERMPDZ256ri , X86::VPERMPDYri }, - { X86::VPERMPSZ256rm , X86::VPERMPSYrm }, - { X86::VPERMPSZ256rr , X86::VPERMPSYrr }, - { X86::VPERMQZ256mi , X86::VPERMQYmi }, - { X86::VPERMQZ256ri , X86::VPERMQYri }, - { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm }, - { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr }, - { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm }, - { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr }, - { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm }, - { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr }, - { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm }, - { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr }, - { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm }, - { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr }, - { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm }, - { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr }, - { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm }, - { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr }, - { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm }, - { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr }, - { X86::VPMINSBZ256rm , X86::VPMINSBYrm }, - { X86::VPMINSBZ256rr , X86::VPMINSBYrr }, - { X86::VPMINSDZ256rm , X86::VPMINSDYrm }, - { X86::VPMINSDZ256rr , X86::VPMINSDYrr }, - { X86::VPMINSWZ256rm , X86::VPMINSWYrm }, - { X86::VPMINSWZ256rr , X86::VPMINSWYrr }, - { X86::VPMINUBZ256rm , X86::VPMINUBYrm }, - { X86::VPMINUBZ256rr , X86::VPMINUBYrr }, - { X86::VPMINUDZ256rm , X86::VPMINUDYrm }, - { X86::VPMINUDZ256rr , X86::VPMINUDYrr }, - { X86::VPMINUWZ256rm , X86::VPMINUWYrm }, - { 
X86::VPMINUWZ256rr , X86::VPMINUWYrr }, - { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm }, - { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr }, - { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm }, - { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr }, - { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm }, - { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr }, - { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm }, - { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr }, - { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm }, - { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr }, - { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm }, - { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr }, - { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm }, - { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr }, - { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm }, - { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr }, - { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm }, - { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr }, - { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm }, - { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr }, - { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm }, - { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr }, - { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm }, - { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr }, - { X86::VPMULDQZ256rm , X86::VPMULDQYrm }, - { X86::VPMULDQZ256rr , X86::VPMULDQYrr }, - { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm }, - { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr }, - { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm }, - { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr }, - { X86::VPMULHWZ256rm , X86::VPMULHWYrm }, - { X86::VPMULHWZ256rr , X86::VPMULHWYrr }, - { X86::VPMULLDZ256rm , X86::VPMULLDYrm }, - { X86::VPMULLDZ256rr , X86::VPMULLDYrr }, - { X86::VPMULLWZ256rm , X86::VPMULLWYrm }, - { X86::VPMULLWZ256rr , X86::VPMULLWYrr }, - { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm }, - { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr }, - { X86::VPORDZ256rm , X86::VPORYrm }, - { X86::VPORDZ256rr , X86::VPORYrr }, - { X86::VPORQZ256rm , X86::VPORYrm }, - { X86::VPORQZ256rr , X86::VPORYrr }, - { X86::VPSADBWZ256rm , X86::VPSADBWYrm }, - { X86::VPSADBWZ256rr , X86::VPSADBWYrr }, - { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm }, - { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr }, - { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi }, - { X86::VPSHUFDZ256ri , X86::VPSHUFDYri }, - { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi }, - { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri }, - { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi }, - { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri }, - { X86::VPSLLDQZ256rr , X86::VPSLLDQYri }, - { X86::VPSLLDZ256ri , X86::VPSLLDYri }, - { X86::VPSLLDZ256rm , X86::VPSLLDYrm }, - { X86::VPSLLDZ256rr , X86::VPSLLDYrr }, - { X86::VPSLLQZ256ri , X86::VPSLLQYri }, - { X86::VPSLLQZ256rm , X86::VPSLLQYrm }, - { X86::VPSLLQZ256rr , X86::VPSLLQYrr }, - { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm }, - { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr }, - { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm }, - { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr }, - { X86::VPSLLWZ256ri , X86::VPSLLWYri }, - { X86::VPSLLWZ256rm , X86::VPSLLWYrm }, - { X86::VPSLLWZ256rr , X86::VPSLLWYrr }, - { X86::VPSRADZ256ri , X86::VPSRADYri }, - { X86::VPSRADZ256rm , X86::VPSRADYrm }, - { X86::VPSRADZ256rr , X86::VPSRADYrr }, - { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm }, - { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr }, - { X86::VPSRAWZ256ri , X86::VPSRAWYri }, - { X86::VPSRAWZ256rm , X86::VPSRAWYrm }, - { X86::VPSRAWZ256rr , X86::VPSRAWYrr }, - { X86::VPSRLDQZ256rr , X86::VPSRLDQYri }, - { X86::VPSRLDZ256ri , X86::VPSRLDYri }, - { X86::VPSRLDZ256rm , X86::VPSRLDYrm }, - { X86::VPSRLDZ256rr , X86::VPSRLDYrr }, - { 
X86::VPSRLQZ256ri , X86::VPSRLQYri }, - { X86::VPSRLQZ256rm , X86::VPSRLQYrm }, - { X86::VPSRLQZ256rr , X86::VPSRLQYrr }, - { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm }, - { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr }, - { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm }, - { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr }, - { X86::VPSRLWZ256ri , X86::VPSRLWYri }, - { X86::VPSRLWZ256rm , X86::VPSRLWYrm }, - { X86::VPSRLWZ256rr , X86::VPSRLWYrr }, - { X86::VPSUBBZ256rm , X86::VPSUBBYrm }, - { X86::VPSUBBZ256rr , X86::VPSUBBYrr }, - { X86::VPSUBDZ256rm , X86::VPSUBDYrm }, - { X86::VPSUBDZ256rr , X86::VPSUBDYrr }, - { X86::VPSUBQZ256rm , X86::VPSUBQYrm }, - { X86::VPSUBQZ256rr , X86::VPSUBQYrr }, - { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm }, - { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr }, - { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm }, - { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr }, - { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm }, - { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr }, - { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm }, - { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr }, - { X86::VPSUBWZ256rm , X86::VPSUBWYrm }, - { X86::VPSUBWZ256rr , X86::VPSUBWYrr }, - { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm }, - { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr }, - { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm }, - { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr }, - { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm }, - { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr }, - { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm }, - { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr }, - { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm }, - { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr }, - { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm }, - { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr }, - { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm }, - { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr }, - { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm }, - { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr }, - { X86::VPXORDZ256rm , X86::VPXORYrm }, - { X86::VPXORDZ256rr , X86::VPXORYrr }, - { X86::VPXORQZ256rm , X86::VPXORYrm }, - { X86::VPXORQZ256rr , X86::VPXORYrr }, - { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi }, - { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri }, - { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi }, - { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri }, - { X86::VSQRTPDZ256m , X86::VSQRTPDYm }, - { X86::VSQRTPDZ256r , X86::VSQRTPDYr }, - { X86::VSQRTPSZ256m , X86::VSQRTPSYm }, - { X86::VSQRTPSZ256r , X86::VSQRTPSYr }, - { X86::VSUBPDZ256rm , X86::VSUBPDYrm }, - { X86::VSUBPDZ256rr , X86::VSUBPDYrr }, - { X86::VSUBPSZ256rm , X86::VSUBPSYrm }, - { X86::VSUBPSZ256rr , X86::VSUBPSYrr }, - { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm }, - { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr }, - { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm }, - { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr }, - { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm }, - { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr }, - { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm }, - { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr }, - { X86::VXORPDZ256rm , X86::VXORPDYrm }, - { X86::VXORPDZ256rr , X86::VXORPDYrr }, - { X86::VXORPSZ256rm , X86::VXORPSYrm }, - { X86::VXORPSZ256rr , X86::VXORPSYrr }, -}; - -#endif diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td index 2ea27a934b478..315a69e6a2a24 100644 --- a/lib/Target/X86/X86InstrVMX.td +++ b/lib/Target/X86/X86InstrVMX.td @@ -43,22 +43,26 @@ def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), "vmptrld\t$vmcs", []>, PS; def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs), "vmptrst\t$vmcs", []>, 
TB;
-def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
-                  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
                  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
-                  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
                  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
-def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
-                  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+let mayStore = 1 in {
+def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                  "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                  "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
-                  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+let mayLoad = 1 in {
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2b296e1e5b85c..53224431c0e90 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -183,6 +183,27 @@ let ExeDomain = SSEPackedInt in {
  defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
}
+// IFMA patterns - for cases where we can safely ignore the overflow bits from
+// the multiply or easily match with existing intrinsics.
+let Predicates = [HasXOP] in { + def : Pat<(v8i16 (add (mul (v8i16 VR128:$src1), (v8i16 VR128:$src2)), + (v8i16 VR128:$src3))), + (VPMACSWWrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)), + (v4i32 VR128:$src3))), + (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)), + (X86PShufd (v4i32 VR128:$src2), (i8 -11))), + (v2i64 VR128:$src3))), + (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)), + (v2i64 VR128:$src3))), + (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)), + (v4i32 VR128:$src3))), + (VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>; +} + // Instruction where second source can be memory, third must be imm8 multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { let isCommutable = 1 in @@ -269,159 +290,87 @@ let ExeDomain = SSEPackedInt in { } // Instruction where either second or third source can be memory -multiclass xop4op_int<bits<8> opc, string OpcodeStr, - Intrinsic Int128, Intrinsic Int256> { - // 128-bit Instruction - def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), +multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT> { + def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, - XOP_4V; - def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i128mem:$src3), + [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), + (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V; + def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int128 VR128:$src1, VR128:$src2, - (bitconvert (loadv2i64 addr:$src3))))]>, + [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1), + (X86andnp (load addr:$src3), RC:$src2))))]>, XOP_4V, VEX_W; - def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, VR128:$src3), + def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, - (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - VR128:$src3))]>, + [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), + (X86andnp RC:$src3, (load addr:$src2)))))]>, XOP_4V; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), + def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V, VEX_W; - - // 256-bit Instruction - def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, - 
XOP_4V, VEX_L; - def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i256mem:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, - (Int256 VR256:$src1, VR256:$src2, - (bitconvert (loadv4i64 addr:$src3))))]>, - XOP_4V, VEX_W, VEX_L; - def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR256:$dst, - (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)), - VR256:$src3))]>, - XOP_4V, VEX_L; - // For disassembler - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W, VEX_L; } let ExeDomain = SSEPackedInt in { - defm VPCMOV : xop4op_int<0xA2, "vpcmov", - int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>; + defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>; + defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L; } -let Predicates = [HasXOP] in { - def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), - (X86andnp VR128:$src3, VR128:$src2))), - (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; - - def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), - (X86andnp VR256:$src3, VR256:$src2))), - (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>; -} - -multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType vt128, ValueType vt256, - ValueType id128, ValueType id256, - PatFrag ld_128, PatFrag ld_256> { - def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), +multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, + X86MemOperand intmemop, X86MemOperand fpmemop, + ValueType VT, PatFrag FPLdFrag, + PatFrag IntLdFrag> { + def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - (id128 VR128:$src3), (i8 imm:$src4))))]>; - def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4), + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>; + def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - (id128 (bitconvert (loadv2i64 addr:$src3))), - (i8 imm:$src4))))]>, - VEX_W; - def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, RC:$src2, + (bitconvert (IntLdFrag addr:$src3)), + (i8 imm:$src4))))]>, VEX_W; + def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), - (vt128 (bitconvert (ld_128 addr:$src2))), - (id128 VR128:$src3), (i8 imm:$src4))))]>; + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, 
(FPLdFrag addr:$src2), + RC:$src3, (i8 imm:$src4))))]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), + def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), []>, VEX_W; - - def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), - !strconcat(OpcodeStr, - "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR256:$dst, - (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), - (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; - def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4), - !strconcat(OpcodeStr, - "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR256:$dst, - (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), - (id256 (bitconvert (loadv4i64 addr:$src3))), - (i8 imm:$src4))))]>, VEX_W, VEX_L; - def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), - !strconcat(OpcodeStr, - "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - [(set VR256:$dst, - (vt256 (OpNode (vt256 VR256:$src1), - (vt256 (bitconvert (ld_256 addr:$src2))), - (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; - // For disassembler - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), - !strconcat(OpcodeStr, - "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W, VEX_L; } -let ExeDomain = SSEPackedDouble in - defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64, - v2i64, v4i64, loadv2f64, loadv4f64>; +let ExeDomain = SSEPackedDouble in { + defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem, + v2f64, loadv2f64, loadv2i64>; + defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem, + v4f64, loadv4f64, loadv4i64>, VEX_L; +} -let ExeDomain = SSEPackedSingle in - defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32, - v4i32, v8i32, loadv4f32, loadv8f32>; +let ExeDomain = SSEPackedSingle in { + defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem, + v4f32, loadv4f32, loadv2i64>; + defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem, + v8f32, loadv8f32, loadv4i64>, VEX_L; +} diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp new file mode 100644 index 0000000000000..6cc5e8b635975 --- /dev/null +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -0,0 +1,516 @@ +//===- X86InstructionSelector.cpp ----------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// X86. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86RegisterBankInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "X86-isel" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +namespace { + +class X86InstructionSelector : public InstructionSelector { +public: + X86InstructionSelector(const X86Subtarget &STI, + const X86RegisterBankInfo &RBI); + + bool select(MachineInstr &I) const override; + +private: + /// tblgen-erated 'select' implementation, used as the initial selector for + /// the patterns that don't require complex C++. + bool selectImpl(MachineInstr &I) const; + + // TODO: remove after selectImpl support pattern with a predicate. + unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const; + unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const; + unsigned getAddOp(LLT &Ty, const RegisterBank &RB) const; + unsigned getSubOp(LLT &Ty, const RegisterBank &RB) const; + unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, + uint64_t Alignment) const; + + bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectFrameIndex(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + + const X86Subtarget &STI; + const X86InstrInfo &TII; + const X86RegisterInfo &TRI; + const X86RegisterBankInfo &RBI; + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI, + const X86RegisterBankInfo &RBI) + : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI) +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "X86GenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +// FIXME: This should be target-independent, inferred from the types declared +// for each class in the bank. +static const TargetRegisterClass * +getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) { + if (RB.getID() == X86::GPRRegBankID) { + if (Ty.getSizeInBits() == 32) + return &X86::GR32RegClass; + if (Ty.getSizeInBits() == 64) + return &X86::GR64RegClass; + } + if (RB.getID() == X86::VECRRegBankID) { + if (Ty.getSizeInBits() == 32) + return &X86::FR32XRegClass; + if (Ty.getSizeInBits() == 64) + return &X86::FR64XRegClass; + if (Ty.getSizeInBits() == 128) + return &X86::VR128XRegClass; + if (Ty.getSizeInBits() == 256) + return &X86::VR256XRegClass; + if (Ty.getSizeInBits() == 512) + return &X86::VR512RegClass; + } + + llvm_unreachable("Unknown RegBank!"); +} + +// Set X86 Opcode and constrain DestReg. 
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, + const RegisterBankInfo &RBI) { + + unsigned DstReg = I.getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + assert(I.isCopy() && "Generic operators do not allow physical registers"); + return true; + } + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); + (void)DstSize; + unsigned SrcReg = I.getOperand(1).getReg(); + const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + (void)SrcSize; + assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && + "No phys reg on generic operators"); + assert((DstSize == SrcSize || + // Copies are a mean to setup initial types, the number of + // bits may not exactly match. + (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && + "Copy with different width?!"); + + const TargetRegisterClass *RC = nullptr; + + switch (RegBank.getID()) { + case X86::GPRRegBankID: + assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values."); + RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + break; + case X86::VECRRegBankID: + RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank); + break; + default: + llvm_unreachable("Unknown RegBank!"); + } + + // No need to constrain SrcReg. It will get constrained when + // we hit another of its use or its defs. + // Copies do not have constraints. + const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg); + if (!OldRC || !RC->hasSubClassEq(OldRC)) { + if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { + DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) + << " operand\n"); + return false; + } + } + I.setDesc(TII.get(X86::COPY)); + return true; +} + +bool X86InstructionSelector::select(MachineInstr &I) const { + assert(I.getParent() && "Instruction should be in a basic block!"); + assert(I.getParent()->getParent() && "Instruction should be in a function!"); + + MachineBasicBlock &MBB = *I.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Opcode = I.getOpcode(); + if (!isPreISelGenericOpcode(Opcode)) { + // Certain non-generic instructions also need some special handling. + + if (I.isCopy()) + return selectCopy(I, TII, MRI, TRI, RBI); + + // TODO: handle more cases - LOAD_STACK_GUARD, PHI + return true; + } + + assert(I.getNumOperands() == I.getNumExplicitOperands() && + "Generic instruction has unexpected implicit operands\n"); + + // TODO: This should be implemented by tblgen, pattern with predicate not + // supported yet. 
+ if (selectBinaryOp(I, MRI, MF)) + return true; + if (selectLoadStoreOp(I, MRI, MF)) + return true; + if (selectFrameIndex(I, MRI, MF)) + return true; + if (selectConstant(I, MRI, MF)) + return true; + + return selectImpl(I); +} + +unsigned X86InstructionSelector::getFAddOp(LLT &Ty, + const RegisterBank &RB) const { + + if (X86::VECRRegBankID != RB.getID()) + return TargetOpcode::G_FADD; + + if (Ty == LLT::scalar(32)) { + if (STI.hasAVX512()) { + return X86::VADDSSZrr; + } else if (STI.hasAVX()) { + return X86::VADDSSrr; + } else if (STI.hasSSE1()) { + return X86::ADDSSrr; + } + } else if (Ty == LLT::scalar(64)) { + if (STI.hasAVX512()) { + return X86::VADDSDZrr; + } else if (STI.hasAVX()) { + return X86::VADDSDrr; + } else if (STI.hasSSE2()) { + return X86::ADDSDrr; + } + } else if (Ty == LLT::vector(4, 32)) { + if ((STI.hasAVX512()) && (STI.hasVLX())) { + return X86::VADDPSZ128rr; + } else if (STI.hasAVX()) { + return X86::VADDPSrr; + } else if (STI.hasSSE1()) { + return X86::ADDPSrr; + } + } + + return TargetOpcode::G_FADD; +} + +unsigned X86InstructionSelector::getFSubOp(LLT &Ty, + const RegisterBank &RB) const { + + if (X86::VECRRegBankID != RB.getID()) + return TargetOpcode::G_FSUB; + + if (Ty == LLT::scalar(32)) { + if (STI.hasAVX512()) { + return X86::VSUBSSZrr; + } else if (STI.hasAVX()) { + return X86::VSUBSSrr; + } else if (STI.hasSSE1()) { + return X86::SUBSSrr; + } + } else if (Ty == LLT::scalar(64)) { + if (STI.hasAVX512()) { + return X86::VSUBSDZrr; + } else if (STI.hasAVX()) { + return X86::VSUBSDrr; + } else if (STI.hasSSE2()) { + return X86::SUBSDrr; + } + } else if (Ty == LLT::vector(4, 32)) { + if ((STI.hasAVX512()) && (STI.hasVLX())) { + return X86::VSUBPSZ128rr; + } else if (STI.hasAVX()) { + return X86::VSUBPSrr; + } else if (STI.hasSSE1()) { + return X86::SUBPSrr; + } + } + + return TargetOpcode::G_FSUB; +} + +unsigned X86InstructionSelector::getAddOp(LLT &Ty, + const RegisterBank &RB) const { + + if (X86::VECRRegBankID != RB.getID()) + return TargetOpcode::G_ADD; + + if (Ty == LLT::vector(4, 32)) { + if (STI.hasAVX512() && STI.hasVLX()) { + return X86::VPADDDZ128rr; + } else if (STI.hasAVX()) { + return X86::VPADDDrr; + } else if (STI.hasSSE2()) { + return X86::PADDDrr; + } + } + + return TargetOpcode::G_ADD; +} + +unsigned X86InstructionSelector::getSubOp(LLT &Ty, + const RegisterBank &RB) const { + + if (X86::VECRRegBankID != RB.getID()) + return TargetOpcode::G_SUB; + + if (Ty == LLT::vector(4, 32)) { + if (STI.hasAVX512() && STI.hasVLX()) { + return X86::VPSUBDZ128rr; + } else if (STI.hasAVX()) { + return X86::VPSUBDrr; + } else if (STI.hasSSE2()) { + return X86::PSUBDrr; + } + } + + return TargetOpcode::G_SUB; +} + +bool X86InstructionSelector::selectBinaryOp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + const unsigned DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + unsigned NewOpc = I.getOpcode(); + + switch (NewOpc) { + case TargetOpcode::G_FADD: + NewOpc = getFAddOp(Ty, RB); + break; + case TargetOpcode::G_FSUB: + NewOpc = getFSubOp(Ty, RB); + break; + case TargetOpcode::G_ADD: + NewOpc = getAddOp(Ty, RB); + break; + case TargetOpcode::G_SUB: + NewOpc = getSubOp(Ty, RB); + break; + default: + break; + } + + if (NewOpc == I.getOpcode()) + return false; + + I.setDesc(TII.get(NewOpc)); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, + 
unsigned Opc, + uint64_t Alignment) const { + bool Isload = (Opc == TargetOpcode::G_LOAD); + bool HasAVX = STI.hasAVX(); + bool HasAVX512 = STI.hasAVX512(); + bool HasVLX = STI.hasVLX(); + + if (Ty == LLT::scalar(8)) { + if (X86::GPRRegBankID == RB.getID()) + return Isload ? X86::MOV8rm : X86::MOV8mr; + } else if (Ty == LLT::scalar(16)) { + if (X86::GPRRegBankID == RB.getID()) + return Isload ? X86::MOV16rm : X86::MOV16mr; + } else if (Ty == LLT::scalar(32)) { + if (X86::GPRRegBankID == RB.getID()) + return Isload ? X86::MOV32rm : X86::MOV32mr; + if (X86::VECRRegBankID == RB.getID()) + return Isload ? (HasAVX512 ? X86::VMOVSSZrm + : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) + : (HasAVX512 ? X86::VMOVSSZmr + : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + } else if (Ty == LLT::scalar(64)) { + if (X86::GPRRegBankID == RB.getID()) + return Isload ? X86::MOV64rm : X86::MOV64mr; + if (X86::VECRRegBankID == RB.getID()) + return Isload ? (HasAVX512 ? X86::VMOVSDZrm + : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) + : (HasAVX512 ? X86::VMOVSDZmr + : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + } else if (Ty.isVector() && Ty.getSizeInBits() == 128) { + if (Alignment >= 16) + return Isload ? (HasVLX ? X86::VMOVAPSZ128rm + : HasAVX512 + ? X86::VMOVAPSZ128rm_NOVLX + : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) + : (HasVLX ? X86::VMOVAPSZ128mr + : HasAVX512 + ? X86::VMOVAPSZ128mr_NOVLX + : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return Isload ? (HasVLX ? X86::VMOVUPSZ128rm + : HasAVX512 + ? X86::VMOVUPSZ128rm_NOVLX + : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) + : (HasVLX ? X86::VMOVUPSZ128mr + : HasAVX512 + ? X86::VMOVUPSZ128mr_NOVLX + : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + return Opc; +} + +bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + + unsigned Opc = I.getOpcode(); + + if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD) + return false; + + const unsigned DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); + + auto &MemOp = **I.memoperands_begin(); + unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); + if (NewOpc == Opc) + return false; + + I.setDesc(TII.get(NewOpc)); + MachineInstrBuilder MIB(MF, I); + if (Opc == TargetOpcode::G_LOAD) + addOffset(MIB, 0); + else { + // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL) + I.RemoveOperand(0); + addOffset(MIB, 0).addUse(DefReg); + } + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool X86InstructionSelector::selectFrameIndex(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_FRAME_INDEX) + return false; + + const unsigned DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + + // Use LEA to calculate frame index. + unsigned NewOpc; + if (Ty == LLT::pointer(0, 64)) + NewOpc = X86::LEA64r; + else if (Ty == LLT::pointer(0, 32)) + NewOpc = STI.isTarget64BitILP32() ? 
X86::LEA64_32r : X86::LEA32r; + else + llvm_unreachable("Can't select G_FRAME_INDEX, unsupported type."); + + I.setDesc(TII.get(NewOpc)); + MachineInstrBuilder MIB(MF, I); + addOffset(MIB, 0); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +bool X86InstructionSelector::selectConstant(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_CONSTANT) + return false; + + const unsigned DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + + assert(Ty.isScalar() && "invalid element type."); + + uint64_t Val = 0; + if (I.getOperand(1).isCImm()) { + Val = I.getOperand(1).getCImm()->getZExtValue(); + I.getOperand(1).ChangeToImmediate(Val); + } else if (I.getOperand(1).isImm()) { + Val = I.getOperand(1).getImm(); + } else + llvm_unreachable("Unsupported operand type."); + + unsigned NewOpc; + switch (Ty.getSizeInBits()) { + case 8: + NewOpc = X86::MOV8ri; + break; + case 16: + NewOpc = X86::MOV16ri; + break; + case 32: + NewOpc = X86::MOV32ri; + break; + case 64: { + // TODO: in case isUInt<32>(Val), X86::MOV32ri can be used + if (isInt<32>(Val)) + NewOpc = X86::MOV64ri32; + else + NewOpc = X86::MOV64ri; + break; + } + default: + llvm_unreachable("Can't select G_CONSTANT, unsupported type."); + } + + I.setDesc(TII.get(NewOpc)); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + +InstructionSelector * +llvm::createX86InstructionSelector(X86Subtarget &Subtarget, + X86RegisterBankInfo &RBI) { + return new X86InstructionSelector(Subtarget, RBI); +} diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp index d9edf4676faf8..806d6cc888f0f 100644 --- a/lib/Target/X86/X86InterleavedAccess.cpp +++ b/lib/Target/X86/X86InterleavedAccess.cpp @@ -19,6 +19,7 @@ using namespace llvm; +namespace { /// \brief This class holds necessary information to represent an interleaved /// access group and supports utilities to lower the group into /// X86-specific instructions/intrinsics. @@ -27,7 +28,6 @@ using namespace llvm; /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr /// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6> /// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7> - class X86InterleavedAccessGroup { /// \brief Reference to the wide-load instruction of an interleaved access /// group. @@ -95,6 +95,7 @@ public: /// instructions/intrinsics. 
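Aside (not part of the patch): the G_CONSTANT handling in X86InstructionSelector::selectConstant above keeps MOV64ri32 only for 64-bit constants that pass isInt<32>(Val), i.e. values that survive sign-extension from a 32-bit immediate field; everything else needs the full MOV64ri. A minimal standalone sketch of that check (the helper name fitsSExt32 is invented for illustration):

#include <cstdint>

static bool fitsSExt32(uint64_t Val) {
  // True when sign-extending the low 32 bits reproduces the original value,
  // which is what llvm::isInt<32>() tests.
  return static_cast<int64_t>(static_cast<int32_t>(Val)) ==
         static_cast<int64_t>(Val);
}

// fitsSExt32(0x000000007FFFFFFFULL) -> true  : MOV64ri32 (sign-extended imm32)
// fitsSExt32(0x0000000080000000ULL) -> false : MOV64ri   (full imm64 needed)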
bool lowerIntoOptimizedSequence(); }; +} // end anonymous namespace bool X86InterleavedAccessGroup::isSupported() const { VectorType *ShuffleVecTy = Shuffles[0]->getType(); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 63a02af02faad..2a40399ba5712 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK + FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, }; struct IntrinsicData { @@ -67,6 +67,23 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0), + X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0), + X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), @@ -325,6 +342,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) { * the alphabetical order. 
*/ static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), + X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0), X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0), X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0), @@ -351,9 +370,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0), X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), @@ -421,18 +440,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), @@ -455,18 +462,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), + X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), - + X86_INTRINSIC_DATA(avx512_kxor_w, 
MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADD_RND, 0), + X86ISD::FADDS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FADD_RND, 0), + X86ISD::FADDS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC, X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC, @@ -720,9 +729,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIV_RND, 0), + X86ISD::FDIVS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FDIV_RND, 0), + X86ISD::FDIVS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, @@ -795,74 +804,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VGETMANTS, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, X86ISD::VGETMANTS, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), - X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMAX_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMAX_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMAXS, X86ISD::FMAXS_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMAXS, X86ISD::FMAXS_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), - X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMIN_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMIN_RND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMINS, X86ISD::FMINS_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK, + X86ISD::FMINS, X86ISD::FMINS_RND), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL, X86ISD::FMUL_RND), X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMUL_RND, 0), + X86ISD::FMULS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FMUL_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), - X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0), + X86ISD::FMULS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), @@ -1191,9 +1168,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB, X86ISD::FSUB_RND), X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUB_RND, 0), + X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::FSUB_RND, 0), + X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), @@ -1486,6 +1463,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPMADD52L, 0), X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ, X86ISD::VPMADD52L, 0), + X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), @@ -1613,6 +1594,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), @@ -1620,7 +1602,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0), X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0), X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), @@ -1631,6 +1615,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE), X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), @@ -1643,7 +1628,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0), X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0), X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0), X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0), X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), @@ -1696,9 +1683,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), - X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, X86ISD::ABS, 0), - X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, ISD::ABS, 0), + X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, ISD::ABS, 0), + X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, ISD::ABS, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp new file mode 100644 index 0000000000000..c2dc762fec5eb --- /dev/null +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -0,0 +1,142 @@ +//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for X86. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "X86LegalizerInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Type.h" +#include "llvm/Target/TargetOpcodes.h" + +using namespace llvm; +using namespace TargetOpcode; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, + const X86TargetMachine &TM) + : Subtarget(STI), TM(TM) { + + setLegalizerInfo32bit(); + setLegalizerInfo64bit(); + setLegalizerInfoSSE1(); + setLegalizerInfoSSE2(); + + computeTables(); +} + +void X86LegalizerInfo::setLegalizerInfo32bit() { + + if (Subtarget.is64Bit()) + return; + + const LLT p0 = LLT::pointer(0, 32); + const LLT s1 = LLT::scalar(1); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {s8, s16, s32}) + setAction({BinOp, Ty}, Legal); + + for (unsigned MemOp : {G_LOAD, G_STORE}) { + for (auto Ty : {s8, s16, s32, p0}) + setAction({MemOp, Ty}, Legal); + + // And everything's fine in addrspace 0. 
+ setAction({MemOp, 1, p0}, Legal); + } + + // Pointer-handling + setAction({G_FRAME_INDEX, p0}, Legal); + + // Constants + for (auto Ty : {s8, s16, s32, p0}) + setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); + + setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); + setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar); +} + +void X86LegalizerInfo::setLegalizerInfo64bit() { + + if (!Subtarget.is64Bit()) + return; + + const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8); + const LLT s1 = LLT::scalar(1); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); + + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {s8, s16, s32, s64}) + setAction({BinOp, Ty}, Legal); + + for (unsigned MemOp : {G_LOAD, G_STORE}) { + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({MemOp, Ty}, Legal); + + // And everything's fine in addrspace 0. + setAction({MemOp, 1, p0}, Legal); + } + + // Pointer-handling + setAction({G_FRAME_INDEX, p0}, Legal); + + // Constants + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); + + setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); +} + +void X86LegalizerInfo::setLegalizerInfoSSE1() { + if (!Subtarget.hasSSE1()) + return; + + const LLT s32 = LLT::scalar(32); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (auto Ty : {s32, v4s32}) + setAction({BinOp, Ty}, Legal); + + for (unsigned MemOp : {G_LOAD, G_STORE}) + for (auto Ty : {v4s32, v2s64}) + setAction({MemOp, Ty}, Legal); +} + +void X86LegalizerInfo::setLegalizerInfoSSE2() { + if (!Subtarget.hasSSE2()) + return; + + const LLT s64 = LLT::scalar(64); + const LLT v4s32 = LLT::vector(4, 32); + const LLT v2s64 = LLT::vector(2, 64); + + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (auto Ty : {s64, v2s64}) + setAction({BinOp, Ty}, Legal); + + for (unsigned BinOp : {G_ADD, G_SUB}) + for (auto Ty : {v4s32}) + setAction({BinOp, Ty}, Legal); +} diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h new file mode 100644 index 0000000000000..3f00898b42322 --- /dev/null +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -0,0 +1,43 @@ +//===- X86LegalizerInfo.h ------------------------------------------*- C++ +//-*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for X86. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H +#define LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class X86Subtarget; +class X86TargetMachine; + +/// This class provides the information for the target register banks. +class X86LegalizerInfo : public LegalizerInfo { +private: + /// Keep a reference to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. 
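Aside (not part of the patch; described from general GlobalISel behavior, not this diff): in the 32-bit rules above, G_CONSTANT of s64 is marked NarrowScalar, which in practice means the legalizer rebuilds the value from two legal 32-bit pieces, roughly:

#include <cstdint>
#include <utility>

// Split a 64-bit immediate the way narrowing to s32 does: low and high halves.
static std::pair<uint32_t, uint32_t> splitToS32Halves(uint64_t Imm) {
  return {static_cast<uint32_t>(Imm),          // low 32 bits
          static_cast<uint32_t>(Imm >> 32)};   // high 32 bits
}

The two halves are then materialized as legal s32 constants and merged back together, so only 32-bit constant materialization has to be selectable on a 32-bit subtarget.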
+ const X86Subtarget &Subtarget; + const X86TargetMachine &TM; + +public: + X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM); + +private: + void setLegalizerInfo32bit(); + void setLegalizerInfo64bit(); + void setLegalizerInfoSSE1(); + void setLegalizerInfoSSE2(); +}; +} // namespace llvm +#endif diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index feeb2fd5993c8..550e3543a71e8 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -102,7 +102,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -215,6 +215,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break; + case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break; case X86II::MO_PIC_BASE_OFFSET: case X86II::MO_DARWIN_NONLAZY_PIC_BASE: Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -357,7 +358,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, const MachineOperand &MO) const { switch (MO.getType()) { default: - MI->dump(); + MI->print(errs()); llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: // Ignore all implicit register operands. @@ -498,11 +499,16 @@ ReSimplify: break; } - // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction. + // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction. 
{ unsigned Opcode; case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode; case X86::TAILJMPd: case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode; + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: + Opcode = X86::GetCondBranchFromCond( + static_cast<X86::CondCode>(MI->getOperand(1).getImm())); + goto SetTailJmpOpcode; SetTailJmpOpcode: MCOperand Saved = OutMI.getOperand(0); @@ -888,30 +894,47 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, SM.recordStatepoint(MI); } -void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, - X86MCInstLower &MCIL) { - // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands> +void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, + X86MCInstLower &MCIL) { + // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, + // <opcode>, <operands> - unsigned LoadDefRegister = MI.getOperand(0).getReg(); - MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol(); - unsigned LoadOpcode = MI.getOperand(2).getImm(); - unsigned LoadOperandsBeginIdx = 3; + unsigned DefRegister = FaultingMI.getOperand(0).getReg(); + FaultMaps::FaultKind FK = + static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); + MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); + unsigned Opcode = FaultingMI.getOperand(3).getImm(); + unsigned OperandsBeginIdx = 4; - FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel); + assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); + FM.recordFaultingOp(FK, HandlerLabel); - MCInst LoadMI; - LoadMI.setOpcode(LoadOpcode); + MCInst MI; + MI.setOpcode(Opcode); - if (LoadDefRegister != X86::NoRegister) - LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + if (DefRegister != X86::NoRegister) + MI.addOperand(MCOperand::createReg(DefRegister)); - for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, - E = MI.operands_end(); + for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx, + E = FaultingMI.operands_end(); I != E; ++I) - if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I)) - LoadMI.addOperand(MaybeOperand.getValue()); + if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I)) + MI.addOperand(MaybeOperand.getValue()); + + OutStreamer->EmitInstruction(MI, getSubtargetInfo()); +} - OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo()); +void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, + X86MCInstLower &MCIL) { + bool Is64Bits = Subtarget->is64Bit(); + MCContext &Ctx = OutStreamer->getContext(); + MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__"); + const MCSymbolRefExpr *Op = + MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx); + + EmitAndCountInstruction( + MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32) + .addExpr(Op)); } void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, @@ -1276,9 +1299,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TAILJMPr: case X86::TAILJMPm: case X86::TAILJMPd: + case X86::TAILJMPd_CC: case X86::TAILJMPr64: case X86::TAILJMPm64: case X86::TAILJMPd64: + case X86::TAILJMPd64_CC: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: // Lower these as normal, but add some comments. 
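Aside (illustration only, not part of the patch; the function names are invented and whether the fold fires depends on optimization level and branch folding): the TAILJMPd_CC/TAILJMPd64_CC cases added above lower conditional tail-call pseudos into the matching Jcc aimed directly at the callee. Source of the following shape is the kind of code that can produce them:

int fallback(int);
int fast_path(int);

int dispatch(int x) {
  if (x < 0)
    return fallback(x);  // guarded call in tail position: may become a
                         // conditional tail call (TAILJMPd_CC -> Jcc fallback)
  return fast_path(x);   // plain tail call (TAILJMPd -> jmp fast_path)
}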
@@ -1367,8 +1392,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case TargetOpcode::STATEPOINT: return LowerSTATEPOINT(*MI, MCInstLowering); - case TargetOpcode::FAULTING_LOAD_OP: - return LowerFAULTING_LOAD_OP(*MI, MCInstLowering); + case TargetOpcode::FAULTING_OP: + return LowerFAULTING_OP(*MI, MCInstLowering); + + case TargetOpcode::FENTRY_CALL: + return LowerFENTRY_CALL(*MI, MCInstLowering); case TargetOpcode::PATCHABLE_OP: return LowerPATCHABLE_OP(*MI, MCInstLowering); @@ -1501,7 +1529,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 64> Mask; DecodePSHUFBMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } @@ -1572,15 +1601,16 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask), + !EnablePrintSchedInfo); } break; } case X86::VPERMIL2PDrm: case X86::VPERMIL2PSrm: - case X86::VPERMIL2PDrmY: - case X86::VPERMIL2PSrmY: { + case X86::VPERMIL2PDYrm: + case X86::VPERMIL2PSYrm: { if (!OutStreamer->isVerboseAsm()) break; assert(MI->getNumOperands() >= 8 && @@ -1593,8 +1623,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { unsigned ElSize; switch (MI->getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break; - case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break; + case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break; + case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break; } const MachineOperand &MaskOp = MI->getOperand(6); @@ -1602,7 +1632,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1618,7 +1649,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPPERMMask(C, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask), + !EnablePrintSchedInfo); } break; } @@ -1678,7 +1710,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { CS << "?"; } CS << "]"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } else if (auto *CV = dyn_cast<ConstantVector>(C)) { CS << "<"; for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { @@ -1710,7 +1742,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer->AddComment(CS.str()); + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } } break; diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp index c9e636f1eb00b..3fcb642424adc 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -9,6 +9,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include 
"llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -20,11 +21,8 @@ void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( MF->getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); - for (const MCPhysReg *CSR = - RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF); - unsigned Reg = *CSR; - ++CSR) - { + for (const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs(); + unsigned Reg = *CSR; ++CSR) { if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) RestoreBasePointerOffset -= SlotSize; } diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp new file mode 100644 index 0000000000000..dd21e2b7c4a13 --- /dev/null +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -0,0 +1,271 @@ +//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// \file This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the DAG scheduling mutation to +// pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "X86MacroFusion.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define DEBUG_TYPE "misched" + +STATISTIC(NumFused, "Number of instr pairs fused"); + +using namespace llvm; + +static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden, + cl::desc("Enable scheduling for macro fusion."), cl::init(true)); + +namespace { + +/// \brief Verify that the instruction pair, First and Second, +/// should be scheduled back to back. If either instruction is unspecified, +/// then verify that the other instruction may be part of a pair at all. +static bool shouldScheduleAdjacent(const X86Subtarget &ST, + const MachineInstr *First, + const MachineInstr *Second) { + // Check if this processor supports macro-fusion. Since this is a minor + // heuristic, we haven't specifically reserved a feature. hasAVX is a decent + // proxy for SandyBridge+. + if (!ST.hasAVX()) + return false; + + enum { + FuseTest, + FuseCmp, + FuseInc + } FuseKind; + + assert((First || Second) && "At least one instr must be specified"); + unsigned FirstOpcode = First + ? First->getOpcode() + : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); + unsigned SecondOpcode = Second + ? 
Second->getOpcode() + : static_cast<unsigned>(X86::INSTRUCTION_LIST_END); + + switch (SecondOpcode) { + default: + return false; + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: + FuseKind = FuseInc; + break; + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: + FuseKind = FuseCmp; + break; + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: + FuseKind = FuseTest; + break; + } + + switch (FirstOpcode) { + default: + return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + case X86::TEST8ri: + case X86::TEST16ri: + case X86::TEST32ri: + case X86::TEST32i32: + case X86::TEST64i32: + case X86::TEST64ri32: + case X86::TEST8rm: + case X86::TEST16rm: + case X86::TEST32rm: + case X86::TEST64rm: + case X86::TEST8ri_NOREX: + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8rm: + case X86::AND8rr: + return true; + case X86::CMP16i16: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP32i32: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP64i32: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP8i8: + case X86::CMP8ri: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri8_DB: + case X86::ADD16ri_DB: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_DB: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri8_DB: + case X86::ADD32ri_DB: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_DB: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri32_DB: + case X86::ADD64ri8: + case X86::ADD64ri8_DB: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD8i8: + case X86::ADD8mi: + case X86::ADD8mr: + case X86::ADD8ri: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8rm: + case X86::SUB8rr: + return FuseKind == FuseCmp || FuseKind == FuseInc; + case X86::INC16r: + case X86::INC32r: + case X86::INC64r: + case X86::INC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64r: + case X86::DEC8r: + return FuseKind == FuseInc; + case X86::INSTRUCTION_LIST_END: + return true; + } +} + +/// \brief Post-process the DAG to create cluster edges between instructions +/// that may be fused by the processor into a single operation. 
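Aside (not part of the patch): per the opcode lists above, the pairs kept adjacent are a flag-producing TEST/CMP/AND/ADD/SUB/INC/DEC followed immediately by the conditional jump that consumes the flags. An ordinary counted loop is a typical producer of such a pair; the generated assembly depends on the compiler, so treat the comments as the expected shape rather than a guarantee:

unsigned count_below(const int *P, unsigned N, int Bound) {
  unsigned C = 0;
  for (unsigned I = 0; I < N; ++I) {  // the loop back-edge usually ends in a
    if (P[I] < Bound)                 // CMP/Jcc (or TEST/Jcc) pair, which is
      ++C;                            // exactly what shouldScheduleAdjacent
  }                                   // wants scheduled back to back
  return C;
}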
+class X86MacroFusion : public ScheduleDAGMutation { +public: + X86MacroFusion() {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override; +}; + +void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>(); + + // For now, assume targets can only fuse with the branch. + SUnit &ExitSU = DAG->ExitSU; + MachineInstr *Branch = ExitSU.getInstr(); + if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch)) + return; + + for (SDep &PredDep : ExitSU.Preds) { + if (PredDep.isWeak()) + continue; + SUnit &SU = *PredDep.getSUnit(); + MachineInstr &Pred = *SU.getInstr(); + if (!shouldScheduleAdjacent(ST, &Pred, Branch)) + continue; + + // Create a single weak edge from SU to ExitSU. The only effect is to cause + // bottom-up scheduling to heavily prioritize the clustered SU. There is no + // need to copy predecessor edges from ExitSU to SU, since top-down + // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling + // of SU, we could create an artificial edge from the deepest root, but it + // hasn't been needed yet. + bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster)); + (void)Success; + assert(Success && "No DAG nodes should be reachable from ExitSU"); + + // Adjust latency of data deps between the nodes. + for (SDep &PredDep : ExitSU.Preds) + if (PredDep.getSUnit() == &SU) + PredDep.setLatency(0); + for (SDep &SuccDep : SU.Succs) + if (SuccDep.getSUnit() == &ExitSU) + SuccDep.setLatency(0); + + ++NumFused; + DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse "; + SU.print(dbgs(), DAG); + dbgs() << " - ExitSU" + << " / " << DAG->TII->getName(Pred.getOpcode()) << " - " + << DAG->TII->getName(Branch->getOpcode()) << '\n';); + + break; + } +} + +} // end namespace + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> +createX86MacroFusionDAGMutation () { + return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr; +} + +} // end namespace llvm diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h new file mode 100644 index 0000000000000..e630f802e8e63 --- /dev/null +++ b/lib/Target/X86/X86MacroFusion.h @@ -0,0 +1,30 @@ +//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// \file This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 definition of the DAG scheduling mutation to pair +// instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" + +//===----------------------------------------------------------------------===// +// X86MacroFusion - DAG post-processing to encourage fusion of macro ops. +//===----------------------------------------------------------------------===// + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createX86MacroFusionDAGMutation()); +/// to X86PassConfig::createMachineScheduler() to have an effect. 
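The note above is the intended usage. A sketch of that hook-up in X86PassConfig::createMachineScheduler follows; the surrounding signature, createGenericSchedLive, and ScheduleDAGMILive are the stock machine-scheduler pieces and are assumed here rather than quoted from this diff:

ScheduleDAGInstrs *
X86PassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Returns null when -x86-misched-fusion is off (see the cl::opt above).
  DAG->addMutation(createX86MacroFusionDAGMutation());
  return DAG;
}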
+std::unique_ptr<ScheduleDAGMutation> +createX86MacroFusionDAGMutation(); + +} // end namespace llvm diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index e1447006cd18c..debb192732e5c 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -389,9 +389,6 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, assert(isLEA(First) && isLEA(Last) && "The function works only with LEA instructions"); - // Get new address displacement. - AddrDispShift = getAddrDispShift(Last, 1, First, 1); - // Make sure that LEA def registers belong to the same class. There may be // instructions (like MOV8mr_NOREX) which allow a limited set of registers to // be used as their operands, so we must be sure that replacing one LEA @@ -400,10 +397,13 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, MRI->getRegClass(Last.getOperand(0).getReg())) return false; + // Get new address displacement. + AddrDispShift = getAddrDispShift(Last, 1, First, 1); + // Loop over all uses of the Last LEA to check that its def register is // used only as address base for memory accesses. If so, it can be // replaced, otherwise - no. - for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) { + for (auto &MO : MRI->use_nodbg_operands(Last.getOperand(0).getReg())) { MachineInstr &MI = *MO.getParent(); // Get the number of the first memory operand. @@ -563,8 +563,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { // Loop over all uses of the Last LEA and update their operands. Note // that the correctness of this has already been checked in the // isReplaceable function. - for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()), - UE = MRI->use_end(); + unsigned LastVReg = Last.getOperand(0).getReg(); + for (auto UI = MRI->use_nodbg_begin(LastVReg), + UE = MRI->use_nodbg_end(); UI != UE;) { MachineOperand &MO = *UI++; MachineInstr &MI = *MO.getParent(); @@ -586,6 +587,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { Op.setOffset(Op.getOffset() + AddrDispShift); } + // Mark debug values referring to Last LEA as undefined. + MRI->markUsesInDebugValueAsUndef(LastVReg); + // Since we can possibly extend register lifetime, clear kill flags. MRI->clearKillFlags(First.getOperand(0).getReg()); @@ -594,7 +598,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { // By this moment, all of the Last LEA's uses must be replaced. So we // can freely remove it. - assert(MRI->use_empty(Last.getOperand(0).getReg()) && + assert(MRI->use_empty(LastVReg) && "The LEA's def register must have no uses"); Last.eraseFromParent(); diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp new file mode 100644 index 0000000000000..d395c826e6bf7 --- /dev/null +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -0,0 +1,243 @@ +//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for X86. +/// \todo This should be generated by TableGen. 
+//===----------------------------------------------------------------------===// + +#include "X86RegisterBankInfo.h" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_TARGET_REGBANK_IMPL +#include "X86GenRegisterBank.inc" + +using namespace llvm; +// This file will be TableGen'ed at some point. +#define GET_TARGET_REGBANK_INFO_IMPL +#include "X86GenRegisterBankInfo.def" + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) + : X86GenRegisterBankInfo() { + + // validate RegBank initialization. + const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID); + (void)RBGPR; + assert(&X86::GPRRegBank == &RBGPR && "Incorrect RegBanks inizalization."); + + // The GPR register bank is fully defined by all the registers in + // GR64 + its subclasses. + assert(RBGPR.covers(*TRI.getRegClass(X86::GR64RegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); +} + +const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + + if (X86::GR8RegClass.hasSubClassEq(&RC) || + X86::GR16RegClass.hasSubClassEq(&RC) || + X86::GR32RegClass.hasSubClassEq(&RC) || + X86::GR64RegClass.hasSubClassEq(&RC)) + return getRegBank(X86::GPRRegBankID); + + if (X86::FR32XRegClass.hasSubClassEq(&RC) || + X86::FR64XRegClass.hasSubClassEq(&RC) || + X86::VR128XRegClass.hasSubClassEq(&RC) || + X86::VR256XRegClass.hasSubClassEq(&RC) || + X86::VR512RegClass.hasSubClassEq(&RC)) + return getRegBank(X86::VECRRegBankID); + + llvm_unreachable("Unsupported register kind yet."); +} + +X86GenRegisterBankInfo::PartialMappingIdx +X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) { + if ((Ty.isScalar() && !isFP) || Ty.isPointer()) { + switch (Ty.getSizeInBits()) { + case 8: + return PMI_GPR8; + case 16: + return PMI_GPR16; + case 32: + return PMI_GPR32; + case 64: + return PMI_GPR64; + break; + default: + llvm_unreachable("Unsupported register size."); + } + } else if (Ty.isScalar()) { + switch (Ty.getSizeInBits()) { + case 32: + return PMI_FP32; + case 64: + return PMI_FP64; + default: + llvm_unreachable("Unsupported register size."); + } + } else { + switch (Ty.getSizeInBits()) { + case 128: + return PMI_VEC128; + case 256: + return PMI_VEC256; + case 512: + return PMI_VEC512; + default: + llvm_unreachable("Unsupported register size."); + } + } + + return PMI_None; +} + +void X86RegisterBankInfo::getInstrPartialMappingIdxs( + const MachineInstr &MI, const MachineRegisterInfo &MRI, const bool isFP, + SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx) { + + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + auto &MO = MI.getOperand(Idx); + if (!MO.isReg()) + OpRegBankIdx[Idx] = PMI_None; + else + OpRegBankIdx[Idx] = getPartialMappingIdx(MRI.getType(MO.getReg()), isFP); + } +} + +bool X86RegisterBankInfo::getInstrValueMapping( + const MachineInstr &MI, + const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx, + SmallVectorImpl<const ValueMapping *> &OpdsMapping) { + + unsigned NumOperands = MI.getNumOperands(); + for (unsigned Idx = 0; Idx < NumOperands; ++Idx) { + if (!MI.getOperand(Idx).isReg()) + continue; + + auto Mapping = getValueMapping(OpRegBankIdx[Idx], 1); + if (!Mapping->isValid()) + return 
false; + + OpdsMapping[Idx] = Mapping; + } + return true; +} + +RegisterBankInfo::InstructionMapping +X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, bool isFP) { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumOperands = MI.getNumOperands(); + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + + if (NumOperands != 3 || (Ty != MRI.getType(MI.getOperand(1).getReg())) || + (Ty != MRI.getType(MI.getOperand(2).getReg()))) + llvm_unreachable("Unsupported operand mapping yet."); + + auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3); + return InstructionMapping{DefaultMappingID, 1, Mapping, NumOperands}; +} + +RegisterBankInfo::InstructionMapping +X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + auto Opc = MI.getOpcode(); + + // Try the default logic for non-generic instructions that are either copies + // or already have some operands assigned to banks. + if (!isPreISelGenericOpcode(Opc)) { + InstructionMapping Mapping = getInstrMappingImpl(MI); + if (Mapping.isValid()) + return Mapping; + } + + switch (Opc) { + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + return getSameOperandsMapping(MI, false); + break; + case TargetOpcode::G_FADD: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FDIV: + return getSameOperandsMapping(MI, true); + break; + default: + break; + } + + unsigned NumOperands = MI.getNumOperands(); + + // Track the bank of each register, use NotFP mapping (all scalars in GPRs) + SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); + getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx); + + // Finally construct the computed mapping. + SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); + if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping)) + return InstructionMapping(); + + return InstructionMapping{DefaultMappingID, /* Cost */ 1, + getOperandsMapping(OpdsMapping), NumOperands}; +} + +void X86RegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + return applyDefaultMapping(OpdMapper); +} + +RegisterBankInfo::InstructionMappings +X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: + case TargetOpcode::G_STORE: { + // we going to try to map 32/64 bit to PMI_FP32/PMI_FP64 + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + unsigned NumOperands = MI.getNumOperands(); + + // Track the bank of each register, use FP mapping (all scalars in VEC) + SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); + getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx); + + // Finally construct the computed mapping. 
+    SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+    if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+      break;
+
+    RegisterBankInfo::InstructionMapping Mapping = InstructionMapping{
+        /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands};
+    InstructionMappings AltMappings;
+    AltMappings.emplace_back(std::move(Mapping));
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
new file mode 100644
index 0000000000000..a1e01a9ab9497
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -0,0 +1,81 @@
+//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "X86GenRegisterBank.inc"
+
+namespace llvm {
+
+class LLT;
+
+class X86GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "X86GenRegisterBank.inc"
+#define GET_TARGET_REGBANK_INFO_CLASS
+#include "X86GenRegisterBankInfo.def"
+
+  static RegisterBankInfo::PartialMapping PartMappings[];
+  static RegisterBankInfo::ValueMapping ValMappings[];
+
+  static PartialMappingIdx getPartialMappingIdx(const LLT &Ty, bool isFP);
+  static const RegisterBankInfo::ValueMapping *
+  getValueMapping(PartialMappingIdx Idx, unsigned NumOperands);
+};
+
+class TargetRegisterInfo;
+
+/// This class provides the information for the target register banks.
+class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
+private:
+  /// Get an instruction mapping.
+  /// \return An InstructionMapping with a statically allocated
+  /// OperandsMapping.
+  static InstructionMapping getSameOperandsMapping(const MachineInstr &MI,
+                                                   bool isFP);
+
+  /// Track the bank of each instruction operand (register)
+  static void
+  getInstrPartialMappingIdxs(const MachineInstr &MI,
+                             const MachineRegisterInfo &MRI, const bool isFP,
+                             SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx);
+
+  /// Construct the instruction ValueMapping from PartialMappingIdxs
+  /// \return true if mapping succeeded.
+  static bool
+  getInstrValueMapping(const MachineInstr &MI,
+                       const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+                       SmallVectorImpl<const ValueMapping *> &OpdsMapping);
+
+public:
+  X86RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+  /// See RegisterBankInfo::applyMapping.
+  void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+
+} // namespace llvm
+#endif
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
new file mode 100644
index 0000000000000..6d17cd53a0c14
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -0,0 +1,17 @@
+//=- X86RegisterBanks.td - Describe the X86 Banks --------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: RAX, RCX,...
+def GPRRegBank : RegisterBank<"GPR", [GR64]>;
+
+/// Floating Point/Vector Registers
+def VECRRegBank : RegisterBank<"VECR", [VR512]>;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 65f438f94b042..9bab9a4cf3ba4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -80,7 +80,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
 
 bool
 X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
-  // ExeDepsFixer and PostRAScheduler require liveness.
+  // ExecutionDepsFixer and PostRAScheduler require liveness.
   return true;
 }
@@ -337,7 +337,9 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
       return CSR_64_AllRegs_AVX512_SaveList;
     if (HasAVX)
       return CSR_64_AllRegs_AVX_SaveList;
-    return CSR_64_AllRegs_SaveList;
+    if (HasSSE)
+      return CSR_64_AllRegs_SaveList;
+    return CSR_64_AllRegs_NoSSE_SaveList;
   } else {
     if (HasAVX512)
       return CSR_32_AllRegs_AVX512_SaveList;
@@ -447,7 +449,9 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
       return CSR_64_AllRegs_AVX512_RegMask;
     if (HasAVX)
       return CSR_64_AllRegs_AVX_RegMask;
-    return CSR_64_AllRegs_RegMask;
+    if (HasSSE)
+      return CSR_64_AllRegs_RegMask;
+    return CSR_64_AllRegs_NoSSE_RegMask;
   } else {
     if (HasAVX512)
       return CSR_32_AllRegs_AVX512_RegMask;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 372a15aff15a0..c177ba1d52f7c 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -189,22 +189,22 @@ def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
 def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
 def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
 
-def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
-def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
-def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
-def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
-def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
-def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
-def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
-def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
-def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
-def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
-def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
-def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
-def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
-def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
-def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
-def XMM31: X86Reg<"xmm31", 31>,
DwarfRegNum<[75, -2, -2]>; +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>; } // CostPerUse @@ -437,8 +437,10 @@ def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; -// A class to support the 'A' assembler constraint: EAX then EDX. +// A class to support the 'A' assembler constraint: [ER]AX then [ER]DX. +def GR16_AD : RegisterClass<"X86", [i16], 16, (add AX, DX)>; def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; +def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>; // Scalar SSE2 floating point registers. def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 35257f89100ca..7f7efd7cad3f6 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -366,6 +366,7 @@ def IIC_SSE_MWAIT : InstrItinClass; def IIC_SSE_MONITOR : InstrItinClass; def IIC_SSE_MWAITX : InstrItinClass; def IIC_SSE_MONITORX : InstrItinClass; +def IIC_SSE_CLZERO : InstrItinClass; def IIC_SSE_PREFETCH : InstrItinClass; def IIC_SSE_PAUSE : InstrItinClass; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index f031a281e5dd0..9da8a18965ea6 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -85,10 +85,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( Args.push_back(Entry); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) - .setDiscardResult(); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), + std::move(Args)) + .setDiscardResult(); std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 11115524c8109..2cebb76022ef8 100644 --- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -14,7 +14,7 @@ #include "X86ShuffleDecodeConstantPool.h" #include "Utils/X86ShuffleDecode.h" -#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/APInt.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" @@ -25,7 +25,7 @@ namespace llvm { static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits, - 
SmallBitVector &UndefElts, + APInt &UndefElts, SmallVectorImpl<uint64_t> &RawMask) { // It is not an error for shuffle masks to not be a vector of // MaskEltSizeInBits because the constant pool uniques constants by their @@ -49,6 +49,33 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits, unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumCstElts = CstTy->getVectorNumElements(); + assert((CstSizeInBits % MaskEltSizeInBits) == 0 && + "Unaligned shuffle mask size"); + + unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits; + UndefElts = APInt(NumMaskElts, 0); + RawMask.resize(NumMaskElts, 0); + + // Fast path - if the constants match the mask size then copy direct. + if (MaskEltSizeInBits == CstEltSizeInBits) { + assert(NumCstElts == NumMaskElts && "Unaligned shuffle mask size"); + for (unsigned i = 0; i != NumMaskElts; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) + return false; + + if (isa<UndefValue>(COp)) { + UndefElts.setBit(i); + RawMask[i] = 0; + continue; + } + + auto *Elt = cast<ConstantInt>(COp); + RawMask[i] = Elt->getValue().getZExtValue(); + } + return true; + } + // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(CstSizeInBits, 0); APInt MaskBits(CstSizeInBits, 0); @@ -57,39 +84,30 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits, if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) return false; + unsigned BitOffset = i * CstEltSizeInBits; + if (isa<UndefValue>(COp)) { - APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits); - UndefBits |= EltUndef.shl(i * CstEltSizeInBits); + UndefBits.setBits(BitOffset, BitOffset + CstEltSizeInBits); continue; } - APInt EltBits = cast<ConstantInt>(COp)->getValue(); - EltBits = EltBits.zextOrTrunc(CstSizeInBits); - MaskBits |= EltBits.shl(i * CstEltSizeInBits); + MaskBits.insertBits(cast<ConstantInt>(COp)->getValue(), BitOffset); } // Now extract the undef/constant bit data into the raw shuffle masks. - assert((CstSizeInBits % MaskEltSizeInBits) == 0 && - "Unaligned shuffle mask size"); - - unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits; - UndefElts = SmallBitVector(NumMaskElts, false); - RawMask.resize(NumMaskElts, 0); - for (unsigned i = 0; i != NumMaskElts; ++i) { - APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits); - EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits); + unsigned BitOffset = i * MaskEltSizeInBits; + APInt EltUndef = UndefBits.extractBits(MaskEltSizeInBits, BitOffset); // Only treat the element as UNDEF if all bits are UNDEF, otherwise // treat it as zero. if (EltUndef.isAllOnesValue()) { - UndefElts[i] = true; + UndefElts.setBit(i); RawMask[i] = 0; continue; } - APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits); - EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits); + APInt EltBits = MaskBits.extractBits(MaskEltSizeInBits, BitOffset); RawMask[i] = EltBits.getZExtValue(); } @@ -104,8 +122,8 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { "Unexpected vector size."); // The shuffle mask requires a byte vector. 
- SmallBitVector UndefElts; - SmallVector<uint64_t, 32> RawMask; + APInt UndefElts; + SmallVector<uint64_t, 64> RawMask; if (!extractConstantMask(C, 8, UndefElts, RawMask)) return; @@ -145,8 +163,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size."); // The shuffle mask requires elements the same size as the target. - SmallBitVector UndefElts; - SmallVector<uint64_t, 8> RawMask; + APInt UndefElts; + SmallVector<uint64_t, 16> RawMask; if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; @@ -180,7 +198,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size."); // The shuffle mask requires elements the same size as the target. - SmallBitVector UndefElts; + APInt UndefElts; SmallVector<uint64_t, 8> RawMask; if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; @@ -231,8 +249,8 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { "Unexpected vector size."); // The shuffle mask requires a byte vector. - SmallBitVector UndefElts; - SmallVector<uint64_t, 32> RawMask; + APInt UndefElts; + SmallVector<uint64_t, 16> RawMask; if (!extractConstantMask(C, 8, UndefElts, RawMask)) return; @@ -286,8 +304,8 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize, "Unexpected vector element size."); // The shuffle mask requires elements the same size as the target. - SmallBitVector UndefElts; - SmallVector<uint64_t, 8> RawMask; + APInt UndefElts; + SmallVector<uint64_t, 64> RawMask; if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; @@ -314,8 +332,8 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, "Unexpected vector element size."); // The shuffle mask requires elements the same size as the target. - SmallBitVector UndefElts; - SmallVector<uint64_t, 8> RawMask; + APInt UndefElts; + SmallVector<uint64_t, 64> RawMask; if (!extractConstantMask(C, ElSize, UndefElts, RawMask)) return; diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 586bb7bd7b1a5..92a68759195c8 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -11,19 +11,23 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "X86Subtarget.h" -#include "X86InstrInfo.h" #include "X86TargetMachine.h" +#include "llvm/ADT/Triple.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Host.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" +#include <cassert> +#include <string> #if defined(_MSC_VER) #include <intrin.h> @@ -93,8 +97,17 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, return X86II::MO_NO_FLAG; // Absolute symbols can be referenced directly. - if (GV && GV->isAbsoluteSymbolRef()) - return X86II::MO_NO_FLAG; + if (GV) { + if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) { + // See if we can use the 8-bit immediate form. Note that some instructions + // will sign extend the immediate operand, so to be conservative we only + // accept the range [0,128). 
+ if (CR->getUnsignedMax().ult(128)) + return X86II::MO_ABS8; + else + return X86II::MO_NO_FLAG; + } + } if (TM.shouldAssumeDSOLocal(M, GV)) return classifyLocalReference(GV); @@ -195,7 +208,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+sahf"; } - // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); @@ -263,7 +275,6 @@ void X86Subtarget::initializeEnvironment() { HasVBMI = false; HasIFMA = false; HasRTM = false; - HasHLE = false; HasERI = false; HasCDI = false; HasPFI = false; @@ -277,6 +288,7 @@ void X86Subtarget::initializeEnvironment() { HasRDSEED = false; HasLAHFSAHF = false; HasMWAITX = false; + HasCLZERO = false; HasMPX = false; IsBTMemSlow = false; IsPMULLDSlow = false; @@ -286,10 +298,11 @@ void X86Subtarget::initializeEnvironment() { HasSSEUnalignedMem = false; HasCmpxchg16b = false; UseLeaForSP = false; - HasFastPartialYMMWrite = false; + HasFastPartialYMMorZMMWrite = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; HasFastLZCNT = false; + HasFastSHLDRotate = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; @@ -321,7 +334,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) { // Determine the PICStyle based on the target selected. if (!isPositionIndependent()) @@ -359,4 +372,3 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; } - diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index d80dc4a9b5e80..d0d88d3269492 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -18,33 +18,36 @@ #include "X86ISelLowering.h" #include "X86InstrInfo.h" #include "X86SelectionDAGInfo.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/CallingConv.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include <string> +#include <memory> #define GET_SUBTARGETINFO_HEADER #include "X86GenSubtargetInfo.inc" namespace llvm { + class GlobalValue; -class StringRef; -class TargetMachine; /// The X86 backend supports a number of different styles of PIC. /// namespace PICStyles { + enum Style { StubPIC, // Used on i386-darwin in pic mode. GOT, // Used on 32 bit elf on when in pic mode. RIPRel, // Used on X86-64 when in pic mode. None // Set when not in pic mode. }; -} -class X86Subtarget final : public X86GenSubtargetInfo { +} // end namespace PICStyles +class X86Subtarget final : public X86GenSubtargetInfo { protected: enum X86SSEEnum { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F @@ -96,10 +99,13 @@ protected: /// Target has XSAVE instructions bool HasXSAVE; + /// Target has XSAVEOPT instructions bool HasXSAVEOPT; + /// Target has XSAVEC instructions bool HasXSAVEC; + /// Target has XSAVES instructions bool HasXSAVES; @@ -148,9 +154,6 @@ protected: /// Processor has RTM instructions. bool HasRTM; - /// Processor has HLE. - bool HasHLE; - /// Processor has ADX instructions. 
bool HasADX; @@ -169,6 +172,9 @@ protected: /// Processor has MONITORX/MWAITX instructions. bool HasMWAITX; + /// Processor has Cache Line Zero instruction + bool HasCLZERO; + /// Processor has Prefetch with intent to Write instruction bool HasPFPREFETCHWT1; @@ -201,8 +207,8 @@ protected: bool UseLeaForSP; /// True if there is no performance penalty to writing only the lower parts - /// of a YMM register without clearing the upper part. - bool HasFastPartialYMMWrite; + /// of a YMM or ZMM register without clearing the upper part. + bool HasFastPartialYMMorZMMWrite; /// True if hardware SQRTSS instruction is at least as fast (latency) as /// RSQRTSS followed by a Newton-Raphson iteration. @@ -223,6 +229,9 @@ protected: /// True if LZCNT instruction is fast. bool HasFastLZCNT; + /// True if SHLD based rotate is fast. + bool HasFastSHLDRotate; + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; @@ -265,24 +274,12 @@ protected: /// Processor supports MPX - Memory Protection Extensions bool HasMPX; - /// Processor supports Invalidate Process-Context Identifier - bool HasInvPCId; - - /// Processor has VM Functions - bool HasVMFUNC; - - /// Processor has Supervisor Mode Access Protection - bool HasSMAP; - /// Processor has Software Guard Extensions bool HasSGX; /// Processor supports Flush Cache Line instruction bool HasCLFLUSHOPT; - /// Processor has Persistent Commit feature - bool HasPCOMMIT; - /// Processor supports Cache Line Write Back instruction bool HasCLWB; @@ -307,8 +304,8 @@ protected: /// This is used to avoid ifndefs spreading around while GISel is /// an optional library. std::unique_ptr<GISelAccessor> GISel; -private: +private: /// Override the stack alignment. unsigned StackAlignOverride; @@ -341,13 +338,17 @@ public: const X86TargetLowering *getTargetLowering() const override { return &TLInfo; } + const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const X86FrameLowering *getFrameLowering() const override { return &FrameLowering; } + const X86SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } + const X86RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } @@ -370,12 +371,14 @@ public: const InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; + private: /// Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); void initSubtargetFeatures(StringRef CPU, StringRef FS); + public: /// Is this x86_64? (disregarding specific ABI / programming model) bool is64Bit() const { @@ -432,9 +435,9 @@ public: bool hasPCLMUL() const { return HasPCLMUL; } // Prefer FMA4 to FMA - its better for commutation/memory folding and // has equal or better performance on all supported targets. 
- bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; } bool hasFMA4() const { return HasFMA4; } - bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } + bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } @@ -447,13 +450,13 @@ public: bool hasVBMI() const { return HasVBMI; } bool hasIFMA() const { return HasIFMA; } bool hasRTM() const { return HasRTM; } - bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool hasMWAITX() const { return HasMWAITX; } + bool hasCLZERO() const { return HasCLZERO; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isPMULLDSlow() const { return IsPMULLDSlow; } @@ -462,10 +465,13 @@ public: bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } - bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } + bool hasFastPartialYMMorZMMWrite() const { + return HasFastPartialYMMorZMMWrite; + } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } + bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } @@ -481,8 +487,9 @@ public: bool hasVLX() const { return HasVLX; } bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } + bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } - virtual bool isXRaySupported() const override { return is64Bit(); } + bool isXRaySupported() const override { return is64Bit(); } bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } @@ -513,6 +520,7 @@ public: bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } + bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } bool isTargetWindowsMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); @@ -616,6 +624,9 @@ public: /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } + // TODO: Update the regression tests and return true. + bool supportPrintSchedInfo() const override { return false; } + bool enableEarlyIfConversion() const override; /// Return the instruction itineraries based on the subtarget selection. 
@@ -628,6 +639,6 @@ public: } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index aa5cfc64e9ebc..03a1958121ab8 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -11,22 +11,47 @@ // //===----------------------------------------------------------------------===// -#include "X86TargetMachine.h" +#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" #include "X86CallLowering.h" +#include "X86LegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "X86RegisterBankInfo.h" +#endif +#include "X86MacroFusion.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "X86TargetTransformInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/ExecutionDepsFix.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" +#include <memory> +#include <string> + using namespace llvm; static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", @@ -34,8 +59,11 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::init(true), cl::Hidden); namespace llvm { + void initializeWinEHStatePassPass(PassRegistry &); -} +void initializeX86ExecutionDepsFixPass(PassRegistry &); + +} // end namespace llvm extern "C" void LLVMInitializeX86Target() { // Register the target. 
@@ -47,27 +75,28 @@ extern "C" void LLVMInitializeX86Target() { initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); + initializeX86ExecutionDepsFixPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) - return make_unique<X86_64MachoTargetObjectFile>(); - return make_unique<TargetLoweringObjectFileMachO>(); + return llvm::make_unique<X86_64MachoTargetObjectFile>(); + return llvm::make_unique<TargetLoweringObjectFileMachO>(); } if (TT.isOSFreeBSD()) - return make_unique<X86FreeBSDTargetObjectFile>(); + return llvm::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl()) - return make_unique<X86LinuxNaClTargetObjectFile>(); + return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSFuchsia()) - return make_unique<X86FuchsiaTargetObjectFile>(); + return llvm::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) - return make_unique<X86ELFTargetObjectFile>(); + return llvm::make_unique<X86ELFTargetObjectFile>(); if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) - return make_unique<X86WindowsTargetObjectFile>(); + return llvm::make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) - return make_unique<TargetLoweringObjectFileCOFF>(); + return llvm::make_unique<TargetLoweringObjectFileCOFF>(); llvm_unreachable("unknown subtarget type"); } @@ -177,31 +206,37 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -X86TargetMachine::~X86TargetMachine() {} +X86TargetMachine::~X86TargetMachine() = default; #ifdef LLVM_BUILD_GLOBAL_ISEL namespace { + struct X86GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CL; - X86GISelActualAccessor(CallLowering* CL): CL(CL) {} + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<InstructionSelector> InstSelector; + const CallLowering *getCallLowering() const override { - return CL.get(); + return CallLoweringInfo.get(); } + const InstructionSelector *getInstructionSelector() const override { - //TODO: Implement - return nullptr; + return InstSelector.get(); } + const LegalizerInfo *getLegalizerInfo() const override { - //TODO: Implement - return nullptr; + return Legalizer.get(); } + const RegisterBankInfo *getRegBankInfo() const override { - //TODO: Implement - return nullptr; + return RegBankInfo.get(); } }; -} // End anonymous namespace. 
+ +} // end anonymous namespace #endif + const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -244,8 +279,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { #ifndef LLVM_BUILD_GLOBAL_ISEL GISelAccessor *GISel = new GISelAccessor(); #else - X86GISelActualAccessor *GISel = new X86GISelActualAccessor( - new X86CallLowering(*I->getTargetLowering())); + X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); + + GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering())); + GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this)); + + auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo()); + GISel->RegBankInfo.reset(RBI); + GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI)); #endif I->setGISelAccessor(*GISel); } @@ -270,12 +311,12 @@ TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { }); } - //===----------------------------------------------------------------------===// // Pass Pipeline Configuration //===----------------------------------------------------------------------===// namespace { + /// X86 Code Generator Pass Configuration Options. class X86PassConfig : public TargetPassConfig { public: @@ -289,7 +330,7 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); - DAG->addMutation(createMacroFusionDAGMutation(DAG->TII)); + DAG->addMutation(createX86MacroFusionDAGMutation()); return DAG; } @@ -301,14 +342,28 @@ public: bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; #endif -bool addILPOpts() override; + bool addILPOpts() override; bool addPreISel() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; void addPreSched2() override; }; -} // namespace + +class X86ExecutionDepsFix : public ExecutionDepsFix { +public: + static char ID; + X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {} + StringRef getPassName() const override { + return "X86 Execution Dependency Fix"; + } +}; +char X86ExecutionDepsFix::ID; + +} // end anonymous namespace + +INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", + "X86 Execution Dependency Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(this, PM); @@ -343,17 +398,17 @@ bool X86PassConfig::addIRTranslator() { } bool X86PassConfig::addLegalizeMachineIR() { - //TODO: Implement + addPass(new Legalizer()); return false; } bool X86PassConfig::addRegBankSelect() { - //TODO: Implement + addPass(new RegBankSelect()); return false; } bool X86PassConfig::addGlobalInstructionSelect() { - //TODO: Implement + addPass(new InstructionSelect()); return false; } #endif @@ -391,7 +446,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } void X86PassConfig::addPreEmitPass() { if (getOptLevel() != CodeGenOpt::None) - addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass)); + addPass(new X86ExecutionDepsFix()); if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index d756d07926dd0..cf933f52604ef 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -13,14 +13,20 @@ #ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H #define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H -#include "X86InstrInfo.h" + #include 
"X86Subtarget.h" -#include "llvm/IR/DataLayout.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" +#include <memory> namespace llvm { class StringRef; +class X86Subtarget; +class X86RegisterBankInfo; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; @@ -32,17 +38,19 @@ public: Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~X86TargetMachine() override; + const X86Subtarget *getSubtargetImpl(const Function &F) const override; TargetIRAnalysis getTargetIRAnalysis() override; // Set up the pass pipeline. TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 5715d826862e9..b742fb472372c 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { return 8; } -unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { +unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { if (ST->hasAVX512()) return 512; @@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { return 32; } +unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { + return getRegisterBitWidth(true); +} + unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { // If the loop will not be vectorized, don't interleave the loop. // Let regular unroll to unroll the loop, which saves the overflow @@ -114,7 +118,7 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, + unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, @@ -207,6 +211,10 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry AVX512UniformConstCostTable[] = { + { ISD::SRA, MVT::v2i64, 1 }, + { ISD::SRA, MVT::v4i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence }; @@ -319,6 +327,14 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry AVX512BWCostTable[] = { + { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v8i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v16i16, 1 }, // vpsravw + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw { ISD::SRA, MVT::v32i16, 1 }, // vpsravw @@ -347,8 +363,12 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SHL, MVT::v16i32, 1 }, { ISD::SRL, MVT::v16i32, 1 }, { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, { ISD::SRL, MVT::v8i64, 1 }, + + { ISD::SRA, MVT::v2i64, 1 }, + { ISD::SRA, MVT::v4i64, 1 }, { ISD::SRA, MVT::v8i64, 1 }, { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. @@ -595,7 +615,6 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. 
{ ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. @@ -804,7 +823,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw - { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb + + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb + // + vpblendvb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb + // + vpblendvb }; if (ST->hasAVX2()) @@ -861,7 +887,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por + + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb }; if (ST->hasSSSE3()) @@ -886,7 +915,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por + + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd }; if (ST->hasSSE2()) @@ -906,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -1272,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return BaseT::getCastInstrCost(Opcode, Dst, Src); } -int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I) { // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1338,11 +1372,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF) { + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed) { // Costs should match the codegen from: // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll @@ -1418,8 +1453,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/ }; static const CostTblEntry SSE42CostTbl[] = { - { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ - { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ + { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/ }; static const CostTblEntry SSSE3CostTbl[] = { { ISD::BITREVERSE, MVT::v2i64, 5 }, @@ -1443,6 +1478,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::CTTZ, MVT::v16i8, 9 } }; static const CostTblEntry SSE2CostTbl[] = { + { ISD::BITREVERSE, MVT::v2i64, 29 }, + { ISD::BITREVERSE, MVT::v4i32, 27 }, + { ISD::BITREVERSE, MVT::v8i16, 27 }, + { ISD::BITREVERSE, MVT::v16i8, 20 }, { ISD::BSWAP, MVT::v2i64, 7 }, { ISD::BSWAP, MVT::v4i32, 7 }, { ISD::BSWAP, MVT::v8i16, 7 }, @@ -1462,8 +1501,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ }; static const CostTblEntry SSE1CostTbl[] = { - { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ - { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ + { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ + { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ + }; + static const CostTblEntry X64CostTbl[] = { // 64-bit targets + { ISD::BITREVERSE, MVT::i64, 14 } + }; + static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets + { ISD::BITREVERSE, MVT::i32, 14 }, + { ISD::BITREVERSE, MVT::i16, 14 }, + { ISD::BITREVERSE, MVT::i8, 11 } }; unsigned ISD = ISD::DELETED_NODE; @@ -1523,12 +1570,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF); + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); } int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF) { - return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF); + ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { + return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); } int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -1562,22 +1616,8 @@ int 
X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } -int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { - assert (Ty->isVectorTy() && "Can only scalarize vectors"); - int Cost = 0; - - for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { - if (Insert) - Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); - if (Extract) - Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); - } - - return Cost; -} - int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, const Instruction *I) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -2132,7 +2172,7 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() { // TODO: We expect this to be beneficial regardless of arch, // but there are currently some unexplained performance artifacts on Atom. // As a temporary solution, disable on Atom. - return !(ST->isAtom() || ST->isSLM()); + return !(ST->isAtom()); } // Get estimation for interleaved load/store operations and strided load. diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index ecaaf951cff7a..9bef9e80c395c 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -33,8 +33,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *ST; const X86TargetLowering *TLI; - int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); - const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } @@ -53,7 +51,8 @@ public: /// @{ unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); + unsigned getRegisterBitWidth(bool Vector) const; + unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, @@ -63,11 +62,13 @@ public: TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>()); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + const Instruction *I = nullptr); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, const Instruction *I = nullptr); int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace); int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, @@ -76,9 +77,11 @@ public: const SCEV *Ptr); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF); + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX); int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF); + ArrayRef<Value *> Args, FastMathFlags FMF, + unsigned VF = 1); int getReductionCost(unsigned Opcode, Type *Ty, bool 
IsPairwiseForm); diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 9766b84be6521..d17dfac6a9974 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -56,11 +56,11 @@ namespace { // Core algorithm state: // BlockState - Each block is either: - // - PASS_THROUGH: There are neither YMM dirtying instructions nor + // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor // vzeroupper instructions in this block. // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this - // block that will ensure that YMM is clean on exit. - // - EXITS_DIRTY: An instruction in the block dirties YMM and no + // block that will ensure that YMM/ZMM is clean on exit. + // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no // subsequent vzeroupper in the block clears it. // // AddedToDirtySuccessors - This flag is raised when a block is added to the @@ -97,6 +97,7 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); } +#ifndef NDEBUG const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { switch (ST) { case PASS_THROUGH: return "Pass-through"; @@ -105,52 +106,56 @@ const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) { } llvm_unreachable("Invalid block exit state."); } +#endif -static bool isYmmReg(unsigned Reg) { - return (Reg >= X86::YMM0 && Reg <= X86::YMM15); +/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only. +/// Thus, there is no need to check for Y/ZMM16 and above. +static bool isYmmOrZmmReg(unsigned Reg) { + return (Reg >= X86::YMM0 && Reg <= X86::YMM15) || + (Reg >= X86::ZMM0 && Reg <= X86::ZMM15); } -static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { +static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first)) + if (isYmmOrZmmReg(I->first)) return true; return false; } -static bool clobbersAllYmmRegs(const MachineOperand &MO) { +static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) { for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } + for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } return true; } -static bool hasYmmReg(MachineInstr &MI) { +static bool hasYmmOrZmmReg(MachineInstr &MI) { for (const MachineOperand &MO : MI.operands()) { - if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) + if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO)) return true; if (!MO.isReg()) continue; if (MO.isDebug()) continue; - if (isYmmReg(MO.getReg())) + if (isYmmOrZmmReg(MO.getReg())) return true; } return false; } -/// Check if any YMM register will be clobbered by this instruction. -static bool callClobbersAnyYmmReg(MachineInstr &MI) { +/// Check if given call instruction has a RegMask operand. 
+static bool callHasRegMask(MachineInstr &MI) { assert(MI.isCall() && "Can only be called on call instructions."); for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { - if (MO.clobbersPhysReg(reg)) - return true; - } + if (MO.isRegMask()) + return true; } return false; } @@ -175,17 +180,20 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) { /// Loop over all of the instructions in the basic block, inserting vzeroupper /// instructions before function calls. void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { - // Start by assuming that the block is PASS_THROUGH which implies no unguarded // calls. BlockExitState CurState = PASS_THROUGH; BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end(); for (MachineInstr &MI : MBB) { + bool IsCall = MI.isCall(); + bool IsReturn = MI.isReturn(); + bool IsControlFlow = IsCall || IsReturn; + // No need for vzeroupper before iret in interrupt handler function, - // epilogue will restore YMM registers if needed. - bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn(); - bool IsControlFlow = MI.isCall() || MI.isReturn(); + // epilogue will restore YMM/ZMM registers if needed. + if (IsX86INTR && IsReturn) + continue; // An existing VZERO* instruction resets the state. if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) { @@ -194,30 +202,30 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { } // Shortcut: don't need to check regular instructions in dirty state. - if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY) + if (!IsControlFlow && CurState == EXITS_DIRTY) continue; - if (hasYmmReg(MI)) { - // We found a ymm-using instruction; this could be an AVX instruction, - // or it could be control flow. + if (hasYmmOrZmmReg(MI)) { + // We found a ymm/zmm-using instruction; this could be an AVX/AVX512 + // instruction, or it could be control flow. CurState = EXITS_DIRTY; continue; } // Check for control-flow out of the current function (which might // indirectly execute SSE instructions). - if (!IsControlFlow || IsReturnFromX86INTR) + if (!IsControlFlow) continue; - // If the call won't clobber any YMM register, skip it as well. It usually - // happens on helper function calls (such as '_chkstk', '_ftol2') where - // standard calling convention is not used (RegMask is not used to mark - // register clobbered and register usage (def/imp-def/use) is well-defined - // and explicitly specified. - if (MI.isCall() && !callClobbersAnyYmmReg(MI)) + // If the call has no RegMask, skip it as well. It usually happens on + // helper function calls (such as '_chkstk', '_ftol2') where standard + // calling convention is not used (RegMask is not used to mark register + // clobbered and register usage (def/imp-def/use) is well-defined and + // explicitly specified. + if (IsCall && !callHasRegMask(MI)) continue; - // The VZEROUPPER instruction resets the upper 128 bits of all AVX + // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15 // registers. In addition, the processor changes back to Clean state, after // which execution of SSE instructions or AVX instructions has no transition // penalty. Add the VZEROUPPER instruction before any function call/return @@ -226,7 +234,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { // predecessor block. 
if (CurState == EXITS_DIRTY) { // After the inserted VZEROUPPER the state becomes clean again, but - // other YMM may appear before other subsequent calls or even before + // other YMM/ZMM may appear before other subsequent calls or even before // the end of the BB. insertVZeroUpper(MI, MBB); CurState = EXITS_CLEAN; @@ -257,30 +265,32 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { /// function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite()) + if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite()) return false; TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); EverMadeChange = false; IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR; - bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); - - // Fast check: if the function doesn't use any ymm registers, we don't need - // to insert any VZEROUPPER instructions. This is constant-time, so it is - // cheap in the common case of no ymm use. - bool YMMUsed = FnHasLiveInYmm; - if (!YMMUsed) { - const TargetRegisterClass *RC = &X86::VR256RegClass; - for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; - i++) { - if (!MRI.reg_nodbg_empty(*i)) { - YMMUsed = true; - break; + bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI); + + // Fast check: if the function doesn't use any ymm/zmm registers, we don't + // need to insert any VZEROUPPER instructions. This is constant-time, so it + // is cheap in the common case of no ymm/zmm use. + bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm; + const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass}; + for (auto *RC : RCs) { + if (!YmmOrZmmUsed) { + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; + i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YmmOrZmmUsed = true; + break; + } } } } - if (!YMMUsed) { + if (!YmmOrZmmUsed) { return false; } @@ -294,9 +304,9 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) processBasicBlock(MBB); - // If any YMM regs are live-in to this function, add the entry block to the - // DirtySuccessors list - if (FnHasLiveInYmm) + // If any YMM/ZMM regs are live-in to this function, add the entry block to + // the DirtySuccessors list + if (FnHasLiveInYmmOrZmm) addDirtySuccessor(MF.front()); // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add |