Diffstat (limited to 'lib/Target/X86')
78 files changed, 15186 insertions, 7633 deletions
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt index 2c1926edc26b..b022a41b192f 100644 --- a/lib/Target/X86/AsmParser/CMakeLists.txt +++ b/lib/Target/X86/AsmParser/CMakeLists.txt @@ -1,7 +1,4 @@ add_llvm_library(LLVMX86AsmParser X86AsmInstrumentation.cpp X86AsmParser.cpp - - LINK_LIBS - LLVMX86CodeGen ) diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt index 284bfd052ccf..9f94d5d38864 100644 --- a/lib/Target/X86/AsmParser/LLVMBuild.txt +++ b/lib/Target/X86/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86AsmParser parent = X86 -required_libraries = MC MCParser Support X86CodeGen X86Desc X86Info +required_libraries = MC MCParser Support X86Desc X86Info add_to_library_groups = X86 diff --git a/lib/Target/X86/AsmParser/Makefile b/lib/Target/X86/AsmParser/Makefile index fb9760796622..f834dfc300a1 100644 --- a/lib/Target/X86/AsmParser/Makefile +++ b/lib/Target/X86/AsmParser/Makefile @@ -9,7 +9,7 @@ LEVEL = ../../../.. LIBRARYNAME = LLVMX86AsmParser -# Hack: we need to include 'main' x86 target directory to grab private headers +# Hack: we need to include 'main' X86 target directory to grab private headers CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9eee4a0f3d82..09cc53a8e6d3 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -10,10 +10,8 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86AsmInstrumentation.h" #include "X86Operand.h" -#include "X86RegisterInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -118,11 +116,6 @@ bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } -std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__asan_report_") + (IsWrite ? 
"store" : "load") + - utostr(AccessSize); -} - class X86AddressSanitizer : public X86AsmInstrumentation { public: struct RegisterContext { @@ -136,26 +129,26 @@ public: public: RegisterContext(unsigned AddressReg, unsigned ShadowReg, unsigned ScratchReg) { - BusyRegs.push_back(convReg(AddressReg, MVT::i64)); - BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); - BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); } - unsigned AddressReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); } - unsigned ShadowReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); } - unsigned ScratchReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); } void AddBusyReg(unsigned Reg) { if (Reg != X86::NoRegister) - BusyRegs.push_back(convReg(Reg, MVT::i64)); + BusyRegs.push_back(convReg(Reg, 64)); } void AddBusyRegs(const X86Operand &Op) { @@ -163,36 +156,36 @@ public: AddBusyReg(Op.getMemIndexReg()); } - unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + unsigned ChooseFrameReg(unsigned Size) const { static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, X86::RCX, X86::RDX, X86::RDI, X86::RSI }; for (unsigned Reg : Candidates) { if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) - return convReg(Reg, VT); + return convReg(Reg, Size); } return X86::NoRegister; } private: - unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { - return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, VT); + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); } std::vector<unsigned> BusyRegs; }; - X86AddressSanitizer(const MCSubtargetInfo &STI) + X86AddressSanitizer(const MCSubtargetInfo *&STI) : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - virtual ~X86AddressSanitizer() {} + ~X86AddressSanitizer() override {} // X86AsmInstrumentation implementation: - virtual void InstrumentAndEmitInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { InstrumentMOVS(Inst, Operands, Ctx, MII, Out); if (RepPrefix) EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); @@ -240,17 +233,16 @@ public: protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, - MCStreamer &Out) { - assert(VT == MVT::i32 || VT == MVT::i64); + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); MCInst Inst; - Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT))); + Inst.setOpcode(Size == 32 ? 
X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out); // Creates new memory operand with Displacement added to an original @@ -261,13 +253,13 @@ protected: MCContext &Ctx, int64_t *Residue); bool is64BitMode() const { - return STI.getFeatureBits()[X86::Mode64Bit]; + return STI->getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { - return STI.getFeatureBits()[X86::Mode32Bit]; + return STI->getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { - return STI.getFeatureBits()[X86::Mode16Bit]; + return STI->getFeatureBits()[X86::Mode16Bit]; } unsigned getPointerWidth() { @@ -437,7 +429,7 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, } void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - MVT::SimpleValueType VT, + unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out) { int64_t Displacement = 0; @@ -450,14 +442,14 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, // Emit Op as is. if (Displacement == 0) { - EmitLEA(Op, VT, Reg, Out); + EmitLEA(Op, Size, Reg, Out); return; } int64_t Residue; std::unique_ptr<X86Operand> NewOp = AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, VT, Reg, Out); + EmitLEA(*NewOp, Size, Reg, Out); while (Residue != 0) { const MCConstantExpr *Disp = @@ -465,7 +457,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, std::unique_ptr<X86Operand> DispOp = X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); - EmitLEA(*DispOp, VT, Reg, Out); + EmitLEA(*DispOp, Size, Reg, Out); Residue -= Disp->getValue(); } } @@ -503,16 +495,16 @@ class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; - X86AddressSanitizer32(const MCSubtargetInfo &STI) + X86AddressSanitizer32(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer32() {} + ~X86AddressSanitizer32() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i32); + return getX86SubSuperRegister(FrameReg, 32); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -535,10 +527,10 @@ public: OrigSPOffset += 4; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -558,24 +550,24 @@ public: MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); } - SpillReg(Out, RegCtx.AddressReg(MVT::i32)); - SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const 
RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); unsigned FrameReg = GetFrameReg(Ctx, Out); if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { @@ -586,18 +578,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, @@ -610,10 +602,11 @@ private: .addReg(X86::ESP) .addImm(-16)); EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? 
"store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -623,14 +616,14 @@ private: void X86AddressSanitizer32::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -673,7 +666,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -698,10 +691,10 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( void X86AddressSanitizer32::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -760,16 +753,16 @@ class X86AddressSanitizer64 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x7fff8000; - X86AddressSanitizer64(const MCSubtargetInfo &STI) + X86AddressSanitizer64(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer64() {} + ~X86AddressSanitizer64() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i64); + return getX86SubSuperRegister(FrameReg, 64); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -792,10 +785,10 @@ public: OrigSPOffset += 8; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -816,24 +809,24 @@ public: } EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); - SpillReg(Out, RegCtx.AddressReg(MVT::i64)); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + 
SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); EmitAdjustRSP(Ctx, Out, 128); unsigned FrameReg = GetFrameReg(Ctx, Out); @@ -845,18 +838,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { @@ -864,7 +857,7 @@ private: std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i64, X86::RSP, Out); + EmitLEA(*Op, 64, X86::RSP, Out); OrigSPOffset += Offset; } @@ -878,12 +871,13 @@ private: .addReg(X86::RSP) .addImm(-16)); - if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + if (RegCtx.AddressReg(64) != X86::RDI) { EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(MVT::i64))); + RegCtx.AddressReg(64))); } - const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? 
"store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); @@ -893,16 +887,16 @@ private: void X86AddressSanitizer64::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -944,7 +938,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -969,10 +963,10 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( void X86AddressSanitizer64::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -1030,7 +1024,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) : STI(STI), InitialFrameReg(0) {} X86AsmInstrumentation::~X86AsmInstrumentation() {} @@ -1043,7 +1037,7 @@ void X86AsmInstrumentation::InstrumentAndEmitInstruction( void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, *STI); } unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, @@ -1067,17 +1061,17 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI) { - Triple T(STI.getTargetTriple()); + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple T(STI->getTargetTriple()); const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && MCOptions.SanitizeAddress) { - if (STI.getFeatureBits()[X86::Mode32Bit] != 
0) + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) return new X86AddressSanitizer32(STI); - if (STI.getFeatureBits()[X86::Mode64Bit] != 0) + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) return new X86AddressSanitizer64(STI); } return new X86AsmInstrumentation(STI); } -} // End llvm namespace +} // end llvm namespace diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 19ebcc44f61e..470ceadb0aa6 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -28,7 +28,8 @@ class X86AsmInstrumentation; X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); class X86AsmInstrumentation { public: @@ -48,15 +49,16 @@ public: protected: friend X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); - X86AsmInstrumentation(const MCSubtargetInfo &STI); + X86AsmInstrumentation(const MCSubtargetInfo *&STI); unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst); - const MCSubtargetInfo &STI; + const MCSubtargetInfo *&STI; unsigned InitialFrameReg; }; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index bca059d8c383..4d8ffac1a82b 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,7 +11,6 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" -#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -26,6 +25,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -57,10 +57,10 @@ static const char OpPrecedence[] = { }; class X86AsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -154,6 +154,7 @@ private: // Push the new operator. InfixOperatorStack.push_back(Op); } + int64_t execute() { // Push any remaining operators onto the postfix stack. while (!InfixOperatorStack.empty()) { @@ -268,6 +269,7 @@ private: bool StopOnLBrac, AddImmPrefix; InfixCalculator IC; InlineAsmIdentifierInfo Info; + public: IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), @@ -712,10 +714,10 @@ private: SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info); + bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -758,23 +760,24 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? 
- return STI.getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Mode16Bit]; } void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; unsigned FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); - + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); } @@ -798,12 +801,12 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); Instrumentation.reset( CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } @@ -912,6 +915,11 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) RegNo = MatchRegisterName(Tok.getString().lower()); + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + if (!is64BitMode()) { // FIXME: This should be done using Requires<Not64BitMode> and // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also @@ -1042,8 +1050,11 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) .Cases("XMMWORD", "xmmword", 128) .Cases("YMMWORD", "ymmword", 256) .Cases("ZMMWORD", "zmmword", 512) @@ -1062,8 +1073,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Insert an explicit size if the user didn't have one. if (!Size) { Size = getPointerWidth(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } // Create an absolute memory reference in order to match against @@ -1082,8 +1093,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( if (!Size) { Size = Info.Type * 8; // Size is in terms of bits in this context. if (Size) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } } @@ -1097,13 +1108,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( } static void -RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, StringRef SymName, int64_t ImmDisp, int64_t FinalImmDisp, SMLoc &BracLoc, SMLoc &StartInBrac, SMLoc &End) { // Remove the '[' and ']' from the IR string. 
- AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); // If ImmDisp is non-zero, then we parsed a displacement before the // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) @@ -1114,15 +1125,14 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have an immediate displacement before the bracketed expression. // Adjust this to match the final immediate displacement. bool Found = false; - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() > BracLoc.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { assert (!Found && "ImmDisp already rewritten."); - (*I).Kind = AOK_Imm; - (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); - (*I).Val = FinalImmDisp; + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; Found = true; break; } @@ -1133,28 +1143,27 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have a symbolic and an immediate displacement, but no displacement // before the bracketed expression. Put the immediate displacement // before the bracketed expression. - AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); } } // Remove all the ImmPrefix rewrites within the brackets. - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix) - (*I).Kind = AOK_Delete; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); } // Skip everything after the symbol. if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); } } @@ -1162,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); + AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { bool UpdateLocLex = true; @@ -1205,7 +1215,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Tok.getLoc(), "Unexpected identifier!"); } else { // This is a dot operator, not an adjacent identifier. 
- if (Identifier.find('.') != StringRef::npos) { + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { return false; } else { InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1223,8 +1234,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Integer: { StringRef ErrMsg; if (isParsingInlineAsm() && SM.getAddImmPrefix()) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, - Tok.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); // Look for 'b' or 'f' following an Integer as a directional label SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); @@ -1237,7 +1247,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; const MCExpr *Val = - MCSymbolRefExpr::create(Sym, Variant, getContext()); + MCSymbolRefExpr::create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); @@ -1275,6 +1285,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (!Done && UpdateLocLex) End = consumeToken(); + + PrevTK = TK; } return false; } @@ -1302,7 +1314,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // A symbolic displacement. Disp = Sym; if (isParsingInlineAsm()) - RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), ImmDisp, SM.getImm(), BracLoc, StartInBrac, End); } @@ -1359,7 +1371,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { MCAsmParser &Parser = getParser(); - assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); @@ -1372,15 +1384,17 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); - while (true) { + do { End = Tok.getEndLoc(); getLexer().Lex(); - - assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); - if (End.getPointer() == EndPtr) break; - } + } while (End.getPointer() < EndPtr); Identifier = LineBuf; + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. + assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + // If the identifier lookup was unsuccessful, assume that we are dealing with // a label. if (!Result) { @@ -1389,9 +1403,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, Loc, false); assert(InternalName.size() && "We should have an internal name here."); // Push a rewrite for replacing the identifier name with the internal name. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, - Identifier.size(), - InternalName)); + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); } // Create the symbol reference. @@ -1418,8 +1431,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. 
if (isParsingInlineAsm()) - InstInfo->AsmRewrites->push_back( - AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); if (getLexer().isNot(AsmToken::LBrac)) { // An immediate following a 'segment register', 'colon' token sequence can @@ -1588,8 +1600,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, - Val)); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); @@ -1613,7 +1624,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { return nullptr; // Don't emit the offset operator. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); // The offset operator will have an 'r' constraint, thus we need to create // register operand to ensure proper matching. Just pick a GPR based on @@ -1664,7 +1675,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { // Rewrite the type operator and the C or C++ type or variable in terms of an // immediate. E.g. TYPE foo -> $$4 unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); return X86Operand::CreateImm(Imm, Start, End); @@ -1688,12 +1699,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. + PtrInOperand = true; } Start = Tok.getLoc(); @@ -1711,10 +1724,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); if (StartTok.getString().size() == Len) // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); else // Otherwise, rewrite the complex expression as a single immediate. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); } if (getLexer().isNot(AsmToken::LBrac)) { @@ -1740,7 +1753,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } // rounding mode token - if (STI.getFeatureBits()[X86::FeatureAVX512] && + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && getLexer().is(AsmToken::LCurly)) return ParseRoundingModeOp(Start, End); @@ -1749,9 +1762,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. 
- if (getLexer().isNot(AsmToken::Colon)) + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } return X86Operand::CreateReg(RegNo, Start, End); - + } + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } @@ -1798,7 +1818,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { } case AsmToken::LCurly:{ SMLoc Start = Parser.getTok().getLoc(), End; - if (STI.getFeatureBits()[X86::FeatureAVX512]) + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) return ParseRoundingModeOp(Start, End); return ErrorOperand(Start, "unknown token in expression"); } @@ -1808,7 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); - if(STI.getFeatureBits()[X86::FeatureAVX512]) { + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); @@ -1983,12 +2003,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } // Validate the scale amount. - if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && ScaleVal != 1) { Error(Loc, "scale factor in 16-bit address must be 1"); return nullptr; - } - if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); return nullptr; } @@ -2175,7 +2196,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name == "repne" || Name == "repnz" || Name == "rex64" || Name == "data16"; - // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2213,6 +2233,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. + // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. 
@@ -2242,9 +2276,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Append default arguments to "ins[bwld]" if (Name.startswith("ins") && Operands.size() == 1 && - (Name == "insb" || Name == "insw" || Name == "insl" || - Name == "insd" )) { - AddDefaultSrcDestOperands(Operands, + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); } @@ -2346,98 +2379,21 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); - if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && - cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { - Operands.erase(Operands.begin() + 1); - static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); - } + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } } return false; } -static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, - bool isCmp) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - if (!isCmp) - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(Inst.getOperand(0)); - Inst = TmpInst; - return true; -} - -static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti16i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::AX, isCmp); -} - -static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti32i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::EAX, isCmp); -} - -static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti64i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); -} - -bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { - switch (Inst.getOpcode()) { - default: return true; - case X86::INT: - X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); - assert(Op.isImm() && "expected immediate"); - int64_t Res; - if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) { - Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); - return false; - } - return true; - } - llvm_unreachable("handle the instruction appropriately"); -} - bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; - case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); - case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8); - case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8); - case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8); - case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8); - case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8); - case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8); - case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8); - case X86::OR64i32: return 
convert64i32to64ri8(Inst, X86::OR64ri8); - case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true); - case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true); - case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true); - case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8); - case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8); - case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8); - case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8); - case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8); - case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8); - case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8); - case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8); - case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8); - case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); - case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); - case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -2457,18 +2413,19 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } Inst.setOpcode(NewOpc); return true; @@ -2573,9 +2530,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. 
@@ -2819,9 +2773,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -2898,10 +2849,29 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); return true; } +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { @@ -2909,10 +2879,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 7610806c4578..54538c804a03 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -13,30 +13,25 @@ namespace llvm { inline bool isImmSExti16i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); } inline bool isImmSExti32i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<32>(Value) && isInt<8>(static_cast<int32_t>(Value))); } inline bool isImmSExti64i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value); } inline bool isImmSExti64i32Value(uint64_t Value) { - return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<32>(Value); } inline bool isImmUnsignedi8Value(uint64_t Value) { - return (( Value <= 
0x00000000000000FFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isUInt<8>(Value) || isInt<8>(Value); } } // End of namespace llvm diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fba2c280d200..b23f5c353013 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -34,18 +34,9 @@ set(sources X86VZeroUpper.cpp X86FixupLEAs.cpp X86WinEHState.cpp + X86OptimizeLEAs.cpp ) -if( CMAKE_CL_64 ) - enable_language(ASM_MASM) - ADD_CUSTOM_COMMAND( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj - MAIN_DEPENDENCY X86CompilationCallback_Win64.asm - COMMAND ${CMAKE_ASM_MASM_COMPILER} /nologo /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - ) - set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj) -endif() - add_llvm_target(X86CodeGen ${sources}) add_subdirectory(AsmParser) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index cfc3ee2fb08f..ce8fcf164668 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -95,11 +95,13 @@ X86GenericDisassembler::X86GenericDisassembler( llvm_unreachable("Invalid CPU mode"); } +namespace { struct Region { ArrayRef<uint8_t> Bytes; uint64_t Base; Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} }; +} // end anonymous namespace /// A callback function that wraps the readByte method from Region. /// @@ -831,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM256: case TYPE_XMM512: case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: case TYPE_VK8: case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: @@ -962,6 +968,7 @@ static bool translateInstruction(MCInst &mcInst, return true; } + mcInst.clear(); mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index f73fa75f888e..040143b15587 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) { * then it should be disassembled as a xacquire/xrelease not repne/rep. */ if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | + ((nextByte == 0xf0) || ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) insn->xAcquireRelease = true; /* @@ -980,6 +980,47 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { insn->opcode == 0xE3) attrMask ^= ATTR_ADSIZE; + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. 
+ if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. + if (insn->opcodeType == TWOBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + } + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -1447,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_XMM: \ return prefix##_XMM0 + index; \ case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ case TYPE_VK8: \ case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index a79a923ac525..28a628e5066b 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -572,8 +572,6 @@ struct InternalInstruction { // The last byte of the opcode, not counting any ModR/M extension uint8_t opcode; - // The ModR/M byte of the instruction, if it is an opcode extension - uint8_t modRMExtension; // decode state diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index ea727e6e82fb..b4c0bc4cd4d9 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 62b6b73e7864..bbb309076610 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 91b144a44824..82f0ee5a5ebc 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -21,6 +21,27 @@ using namespace llvm; +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); + return 0; +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + /// \brief Extracts the src/dst types for a given zero extension instruction. 
/// \note While the number of elements in DstVT type correct, the /// number in the SrcVT type is expanded to fill the src xmm register and the @@ -107,6 +128,75 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { } } +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -127,23 +217,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPDrri: case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPDrmi: case X86::VBLENDPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
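// [Aside, not part of the patch] The hunks in this file replace hard-coded
// vector types (MVT::v2f64 vs. v4f64 vs. v8f64, etc.) with the
// getRegOperandVectorVT() helper introduced above, which derives the vector
// type from the width of the register actually used by the instruction, so a
// single switch arm can serve the XMM, YMM and (where present) ZMM forms.
// A standalone sketch of the idea, using plain ints instead of llvm::MVT and
// MCInst (RegClass, vectorRegSizeBits and numElements are invented for this
// sketch):
#include <cassert>
#include <cstdio>

// Pretend register "classes": 128-, 256- and 512-bit vector registers.
enum class RegClass { XMM, YMM, ZMM };

// Mirrors getVectorRegSize(): map a register to its width in bits.
static unsigned vectorRegSizeBits(RegClass RC) {
  switch (RC) {
  case RegClass::XMM: return 128;
  case RegClass::YMM: return 256;
  case RegClass::ZMM: return 512;
  }
  assert(false && "unknown vector reg");
  return 0;
}

// Mirrors getRegOperandVectorVT(): element count = reg width / element width,
// so one call site covers v2f64, v4f64 and v8f64.
static unsigned numElements(RegClass RC, unsigned ScalarBits) {
  return vectorRegSizeBits(RC) / ScalarBits;
}

int main() {
  std::printf("xmm: %u elts\n", numElements(RegClass::XMM, 64)); // 2
  std::printf("ymm: %u elts\n", numElements(RegClass::YMM, 64)); // 4
  std::printf("zmm: %u elts\n", numElements(RegClass::ZMM, 64)); // 8
  return 0;
}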
case X86::VBLENDPDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f64, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -152,23 +233,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPSrri: case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPSrmi: case X86::VBLENDPSrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VBLENDPSYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8f32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -177,23 +249,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PBLENDWrri: case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PBLENDWrmi: case X86::VPBLENDWrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDWYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPBLENDWYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v16i16, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -201,23 +264,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::VPBLENDDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
+ case X86::VPBLENDDrmi: case X86::VPBLENDDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -239,6 +292,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVLHPSrr: case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -247,569 +301,327 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSHDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVSLDUPYrm: + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::VMOVDDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVDDUPYrm: + CASE_MOVDUP(MOVDDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask); - break; - - case X86::MOVDDUPrr: - case X86::VMOVDDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
- case X86::MOVDDUPrm: - case X86::VMOVDDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); break; case X86::PSLLDQri: case X86::VPSLLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSLLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v32i8, + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSRLDQri: case X86::VPSRLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSRLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v32i8, + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PALIGNR128rr: case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: Src1Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PALIGNR128rm: case X86::VPALIGNR128rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPALIGNR256rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPALIGNR256rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v32i8, + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFDri: case X86::VPSHUFDri: + case X86::VPSHUFDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFDmi: case X86::VPSHUFDmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFDYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8i32, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFHWri: case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
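// [Aside, not part of the patch] The CASE_MOVDUP/CASE_UNPCK/CASE_SHUF macros
// defined earlier in this file stack plain `case` labels, so one switch arm
// covers the SSE, AVX, AVX2 and masked AVX-512 spellings of an instruction;
// e.g. CASE_MOVDUP(MOVSLDUP, r) expands (roughly) to the labels for
// MOVSLDUPrr, VMOVSLDUPrr, VMOVSLDUPYrr and the VMOVSLDUPZ{128,256,}rr{,k,kz}
// variants. A minimal standalone model of the same trick, with invented
// opcode names:
#include <cstdio>

enum Opcode { FOOrr, VFOOrr, VFOOYrr, VFOOZrr, VFOOZrrk, VFOOZrrkz, BARrr };

#define CASE_ALL_FOO(r)                                                        \
  case FOO##r:                                                                 \
  case VFOO##r:                                                                \
  case VFOOY##r:                                                               \
  case VFOOZ##r:                                                               \
  case VFOOZ##r##k:                                                            \
  case VFOOZ##r##kz:

static const char *describe(Opcode Op) {
  switch (Op) {
  CASE_ALL_FOO(rr) // six variants share one arm
    return "some flavor of FOO (reg, reg)";
  default:
    return "something else";
  }
}

int main() {
  std::printf("%s\n", describe(VFOOZrrkz)); // "some flavor of FOO (reg, reg)"
  std::printf("%s\n", describe(BARrr));     // "something else"
  return 0;
}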
case X86::PSHUFHWmi: case X86::VPSHUFHWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFHWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFHWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v16i16, + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; + case X86::PSHUFLWri: case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: case X86::VPSHUFLWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFLWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFLWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v16i16, + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; - case X86::PUNPCKHBWrr: - case X86::VPUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - case X86::VPUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKHBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKHWDrr: - case X86::VPUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - case X86::VPUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKHWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKHDQrr: - case X86::VPUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - case X86::VPUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKHDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHDQYrm: + case X86::MMX_PSHUFWri: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); - break; - case X86::VPUNPCKHDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKHDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i32, ShuffleMask); - break; - case X86::PUNPCKHQDQrr: - case X86::VPUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - case X86::VPUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::MMX_PSHUFWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i64, ShuffleMask); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); break; - case X86::PUNPCKLBWrr: - case X86::VPUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLBWrm: - case X86::VPUNPCKLBWrm: + case X86::PSWAPDrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKLBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKLWDrr: - case X86::VPUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLWDrm: - case X86::VPUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::PSWAPDrm: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); break; - case X86::VPUNPCKLWDYrr: + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLWDYrm: + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::PUNPCKLDQrr: - case X86::VPUNPCKLDQrr: + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLDQrm: - case X86::VPUNPCKLDQrm: + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLDQYrr: + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
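// [Aside, not part of the patch] The new PSWAPDrr/PSWAPDrm handling earlier in
// this hunk uses DecodePSWAPMask() (added in X86ShuffleDecode.cpp further
// down), which simply swaps the two halves of the vector; for v2i32 the
// decoded shuffle mask is {1, 0}. A standalone sketch of that mask
// construction (pswapMask is an invented name):
#include <cstdio>
#include <vector>

static std::vector<int> pswapMask(unsigned NumElts) {
  std::vector<int> Mask;
  unsigned Half = NumElts / 2;
  for (unsigned l = 0; l != Half; ++l) // upper half first...
    Mask.push_back(int(l + Half));
  for (unsigned h = 0; h != Half; ++h) // ...then the lower half
    Mask.push_back(int(h));
  return Mask;
}

int main() {
  for (int M : pswapMask(2)) // prints "1 0" for v2i32
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}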
- case X86::VPUNPCKLDQYrm: + CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::VPUNPCKLDQZrr: + + CASE_UNPCK(PUNPCKHQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLDQZrm: + CASE_UNPCK(PUNPCKHQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::PUNPCKLQDQrr: - case X86::VPUNPCKLQDQrr: + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLQDQrm: - case X86::VPUNPCKLQDQrm: + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQYrr: + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQYrm: + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQZrr: + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQZrm: + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::SHUFPDrri: - case X86::VSHUFPDrri: + CASE_UNPCK(PUNPCKLQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::SHUFPDrmi: - case X86::VSHUFPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPDYrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); + CASE_UNPCK(PUNPCKLQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::SHUFPSrri: - case X86::VSHUFPSrri: + CASE_SHUF(SHUFPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::SHUFPSrmi: - case X86::VSHUFPSrmi: + CASE_SHUF(SHUFPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VSHUFPSYrri: + + CASE_SHUF(SHUFPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VSHUFPSYrmi: + CASE_SHUF(SHUFPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v8f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKLPDrr: - case X86::VUNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - case X86::VUNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDYrm: - DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDZrm: - DecodeUNPCKLMask(MVT::v8f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKLPSrr: - case X86::VUNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPSrm: - case X86::VUNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSYrm: - DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSZrm: - DecodeUNPCKLMask(MVT::v16f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPDrr: - case X86::VUNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - case X86::VUNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::VUNPCKHPDYrm: - DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } break; - case X86::VUNPCKHPDZrr: + } + + CASE_UNPCK(UNPCKLPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPDZrm: - DecodeUNPCKHMask(MVT::v8f64, ShuffleMask); + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKHPSrr: - case X86::VUNPCKHPSrr: + + CASE_UNPCK(UNPCKLPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::UNPCKHPSrm: - case X86::VUNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSYrr: + + CASE_UNPCK(UNPCKHPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSYrm: - DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSZrr: + + CASE_UNPCK(UNPCKHPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSZrm: - DecodeUNPCKHMask(MVT::v16f32, ShuffleMask); + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPSri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSYmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDri: + + CASE_VPERM(PERMILPS, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
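// [Aside, not part of the patch] The VSHUFF/VSHUFI{32x4,64x2} handling just
// above calls decodeVSHUF64x2FamilyMask() (added in X86ShuffleDecode.cpp
// further down), which treats the two sources as one array of 128-bit lanes:
// each result lane is selected by a small field of the immediate, and result
// lanes in the upper half always read from the second source. A standalone
// model of the 512-bit, 64-bit-element case (vshuf64x2Mask is an invented
// name):
#include <cstdio>
#include <vector>

static std::vector<int> vshuf64x2Mask(unsigned Imm) {
  const unsigned NumLanes = 4;    // 512 / 128
  const unsigned EltsPerLane = 2; // 128 / 64
  const unsigned LaneBits = 2;    // selector bits per result lane (512-bit form)
  std::vector<int> Mask;
  for (unsigned L = 0; L != NumLanes; ++L) {
    unsigned Src = (Imm >> (L * LaneBits)) & (NumLanes - 1);
    if (L >= NumLanes / 2)
      Src += NumLanes; // upper result lanes read from source #2
    for (unsigned E = 0; E != EltsPerLane; ++E)
      Mask.push_back(int(Src * EltsPerLane + E));
  }
  return Mask;
}

int main() {
  // Imm = 0xE4 (0b11100100) gives {0,1,2,3, 12,13,14,15}: lanes 0-1 of
  // source #1 followed by lanes 2-3 of source #2.
  for (int M : vshuf64x2Mask(0xE4))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}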
- case X86::VPERMILPDmi: + CASE_VPERM(PERMILPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v2f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPDYri: + + CASE_VPERM(PERMILPD, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. - case X86::VPERMILPDYmi: + CASE_VPERM(PERMILPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERM2F128rr: case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -824,6 +636,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: case X86::VPERMPDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -846,6 +659,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVSSrr: case X86::VMOVSSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -861,6 +675,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::MOVQI2PQIrm: @@ -869,9 +684,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVQI2PQIrm: case X86::VMOVZQI2PQIrm: case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVDI2PDIrm: case X86::VMOVDI2PDIrm: DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6e371da37290..20cd7ffb2e63 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class MCOperand; - class X86IntelInstPrinter final : public MCInstPrinter { public: X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 629802f5dc5e..133bd0e1772a 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -69,15 +69,19 @@ public: class X86AsmBackend : public MCAsmBackend { const StringRef CPU; bool HasNopl; - const uint64_t MaxNopLength; + uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef CPU) - : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && CPU != "c3" && CPU != "c3-2"; + // Max length of true long nop instruction is 15 bytes. 
+ // Max length of long nop replacement instruction is 7 bytes. + // Taking into account SilverMont architecture features max length of nops + // is reduced for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; } unsigned getNumFixupKinds() const override { @@ -200,6 +204,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::ADD64ri8: return X86::ADD64ri32; case X86::ADD64mi8: return X86::ADD64mi32; + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + // SUB case X86::SUB16ri8: return X86::SUB16ri; case X86::SUB16mi8: return X86::SUB16mi; @@ -208,6 +220,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::SUB64ri8: return X86::SUB64ri32; case X86::SUB64mi8: return X86::SUB64mi32; + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + // CMP case X86::CMP16ri8: return X86::CMP16ri; case X86::CMP16mi8: return X86::CMP16mi; @@ -279,7 +299,7 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { /// bytes. /// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - static const uint8_t Nops[10][10] = { + static const uint8_t TrueNops[10][10] = { // nop {0x90}, // xchg %ax,%ax @@ -302,17 +322,31 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, }; - // This CPU doesn't support long nops. If needed add more. - // FIXME: Can we get this from the subtarget somehow? - // FIXME: We could generated something better than plain 0x90. - if (!HasNopl) { - for (uint64_t i = 0; i < Count; ++i) - OW->write8(0x90); - return true; - } + // Alternative nop instructions for CPUs which don't support long nops. + static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 }, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we get if CPU supports long nops from the subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // Emit as many largest nops as needed, then emit a nop of the remaining + // length. do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 
0 : ThisNopLength - 10; @@ -359,6 +393,17 @@ public: } }; +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -610,13 +655,13 @@ private: /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -780,6 +825,10 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + return new ELFX86_32AsmBackend(T, OSABI, CPU); } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index f0d00b0c1bc3..9ff85b9154f8 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,6 +41,16 @@ namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -675,7 +685,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: - return -1; + return -1; case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: @@ -696,23 +706,27 @@ namespace X86II { // Start from 0, skip registers encoded in VEX_VVVV or a mask register. 
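// [Aside, not part of the patch] In the X86AsmBackend::writeNopData() hunk
// above, the backend now picks either the long-NOP table or the LEA-based
// fallback table and fills the requested padding in chunks of at most
// MaxNopLength bytes (15 normally, 7 for CPUs without NOPL or for "slm");
// chunks longer than 10 bytes are stretched with prefix bytes (the `Prefixes`
// computation visible above). A simplified standalone model of the chunking
// only (planNops is an invented name):
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Print the chunk sizes used to cover `Count` bytes of padding.
static void planNops(uint64_t Count, uint64_t MaxNopLength) {
  while (Count != 0) {
    uint64_t ThisNopLength = std::min(Count, MaxNopLength);
    std::printf("emit %llu-byte nop\n", (unsigned long long)ThisNopLength);
    Count -= ThisNopLength;
  }
}

int main() {
  planNops(23, 15); // long-NOP CPU: 15 + 8
  planNops(23, 7);  // no NOPL or "slm": 7 + 7 + 7 + 2
  return 0;
}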
return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: return -1; } } @@ -740,7 +754,7 @@ namespace X86II { case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: - return true; + return true; } return false; } diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index a33468dc4769..736c39dfb6f1 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -32,9 +32,11 @@ namespace { X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) - : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, - // Only i386 uses Rel instead of RelA. - /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} X86ELFObjectWriter::~X86ELFObjectWriter() {} @@ -246,7 +248,8 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Modifier, Type, IsPCRel); - assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); return getRelocType32(Modifier, getType32(Type), IsPCRel); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index deaad2a5b8e8..30d5c802d1ed 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -20,39 +20,42 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit X86MCAsmInfoDarwin(const Triple &Triple); - }; - - struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { - explicit X86_64MCAsmInfoDarwin(const Triple &Triple); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; - }; - - class X86ELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit X86ELFMCAsmInfo(const Triple &Triple); - }; - - class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit X86MCAsmInfoMicrosoft(const Triple &Triple); - }; - - class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); - }; +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; } // namespace llvm #endif diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 10c434c8b1b4..dfab6ec10775 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -510,8 +510,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, - Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); return; } @@ -988,6 +988,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const MCInstrDesc &Desc) { unsigned REX = 0; + bool 
UsesHighByteReg = false; + if (TSFlags & X86II::REX_W) REX |= 1 << 3; // set REX.W @@ -1004,6 +1006,8 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const MCOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything // that returns non-zero. @@ -1073,6 +1077,9 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, } break; } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + return REX; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 83b4091d7665..53a6550acdd5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -122,7 +122,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, } else if (TheTriple.isOSBinFormatELF()) { // Force the use of an ELF container. MAI = new X86ELFMCAsmInfo(TheTriple); - } else if (TheTriple.isWindowsMSVCEnvironment()) { + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { @@ -267,3 +268,184 @@ extern "C" void LLVMInitializeX86TargetMC() { TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); } + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case 
X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case 
X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 6221baba1793..2d2836ff07c5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -79,7 +79,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, MCCodeEmitter *CE, - bool RelaxAll); + bool RelaxAll, bool IncrementalLinkerCompatible); /// Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, @@ -98,6 +98,17 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. +/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 9e801fc8f191..191ebeac7265 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. 
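// [Aside, not part of the patch] From here on, the Mach-O writer hunks swap
// report_fatal_error() for MCContext::reportError() plus an early return, so a
// bad relocation produces a diagnostic tied to the fixup location and lets the
// assembler keep going instead of aborting the process. A minimal standalone
// model of that pattern (ErrorSink and recordRelocation are invented stand-ins
// for MCContext and the writer methods):
#include <cstdio>
#include <string>
#include <vector>

struct ErrorSink {
  std::vector<std::string> Errors;
  void reportError(unsigned Loc, const std::string &Msg) {
    Errors.push_back("line " + std::to_string(Loc) + ": " + Msg);
  }
};

// Returns false on error instead of aborting, mirroring the new control flow.
static bool recordRelocation(ErrorSink &Ctx, unsigned Loc, bool IsPCRel) {
  if (IsPCRel) {
    Ctx.reportError(Loc, "unsupported pc-relative relocation of difference");
    return false; // caller continues and can surface more diagnostics
  }
  return true;
}

int main() {
  ErrorSink Ctx;
  recordRelocation(Ctx, 7, /*IsPCRel=*/true);
  recordRelocation(Ctx, 9, /*IsPCRel=*/false);
  for (const std::string &E : Ctx.Errors)
    std::printf("error: %s\n", E.c_str()); // one diagnostic, no abort
  return 0;
}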
- if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,16 +173,20 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); + return; } Value += Writer->getSymbolAddress(*A, Layout) - @@ -244,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -266,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -292,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -309,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } 
else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ -350,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -363,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. // @@ -387,12 +416,12 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } MachO::any_relocation_info MRE; diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 92f42b68ae51..d04511873b46 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -50,9 +50,11 @@ void X86WinCOFFStreamer::FinishImpl() { MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, - MCCodeEmitter *CE, bool RelaxAll) { + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index cae865a40819..4fdd527d87c8 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -140,13 +140,14 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm, } } -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. 
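For readers unfamiliar with the PSHUF immediate encoding: each pair of immediate bits picks one source element per destination slot within a 128-bit lane, and the NumLanes == 0 fixup added in the function body below extends the same decode to 64-bit MMX vectors (pshufw). A minimal, illustration-only sketch, assuming the in-tree include path Utils/X86ShuffleDecode.h relative to lib/Target/X86:

    #include "Utils/X86ShuffleDecode.h"        // include path assumed, as in-tree users do
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/CodeGen/MachineValueType.h"
    using namespace llvm;

    static void demoPSHUFDecode() {
      SmallVector<int, 8> Mask;
      // PSHUFD $0x1B: 0x1B == 0b00'01'10'11, two bits per destination element.
      DecodePSHUFMask(MVT::v4i32, 0x1B, Mask);   // Mask == {3, 2, 1, 0}
      Mask.clear();
      // PSHUFW $0x4E on a 64-bit MMX vector: exercises the NumLanes == 0 -> 1 case.
      DecodePSHUFMask(MVT::v4i16, 0x4E, Mask);   // Mask == {2, 3, 0, 1}
    }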
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; unsigned NewImm = Imm; @@ -191,6 +192,16 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm, } } +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -222,7 +233,7 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { @@ -253,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { } } +/// \brief Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements() / 2; @@ -277,10 +308,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> +#ifndef NDEBUG unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif // This is a straightforward byte vector. if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { @@ -290,7 +321,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { for (int i = 0; i < NumElements; ++i) { // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte // lane of the vector we're inside. - int Base = i < 16 ? 
0 : 16; + int Base = i & ~0xf; Constant *COp = C->getAggregateElement(i); if (!COp) { ShuffleMask.clear(); return; @@ -357,44 +388,66 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); + // It is not an error for the mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure it's an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necessary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit targets. + unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && "Unexpected number of vector elements."); ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used.
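The replacement loop just below reads the constant at whatever integer element width the constant pool happened to use and rebuilds indices at the requested ElSize granularity, keeping each selection inside its 128-bit lane. A worked sketch of the expected result for a 256-bit VPERMILPS mask (illustrative only; ConstantVector::get may canonicalize to a ConstantDataVector, which getAggregateElement handles the same way):

    #include "Utils/X86ShuffleDecode.h"   // include path assumed, as in-tree users do
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    static void demoVPERMILPSDecode(LLVMContext &Ctx) {
      SmallVector<Constant *, 8> Elts;
      for (int V : {1, 0, 3, 2, 1, 0, 3, 2})
        Elts.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), V));

      SmallVector<int, 8> Mask;
      DecodeVPERMILPMask(ConstantVector::get(Elts), /*ElSize=*/32, Mask);
      // Lane base (0 or 4) plus the low two bits of each constant element:
      // Mask == {1, 0, 3, 2, 5, 4, 7, 6}
    }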
- int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); } + + // TODO: Handle funny-looking vectors too. } void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { @@ -503,4 +556,74 @@ void DecodeINSERTQIMask(int Len, int Idx, ShuffleMask.push_back(SM_SentinelUndef); } +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} } // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 3d10d18e860e..ab18e6438ec9 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -54,6 +54,9 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. 
VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -83,12 +86,18 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, @@ -108,6 +117,22 @@ void DecodeEXTRQIMask(int Len, int Idx, /// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. void DecodeINSERTQIMask(int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 8403ae6101df..fbec6626d99d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -23,56 +23,47 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. +/// This pass initializes a global base register for PIC on x86-32. FunctionPass* createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. 
-/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86EmitCodeToMemory - Returns a pass that converts a register -/// allocated function into raw machine code in a dynamically -/// allocated chunk of memory. -/// -FunctionPass *createEmitX86CodeToMemory(); - -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that removes redundant address recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation.
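These factory functions, including the new createX86OptimizeLEAs above, are consumed by the target's TargetPassConfig in X86TargetMachine.cpp. A rough sketch of that wiring, not the verbatim hook; the exact guards and insertion points may differ:

    // X86PassConfig derives from TargetPassConfig inside X86TargetMachine.cpp.
    void X86PassConfig::addPreRegAlloc() {
      if (getOptLevel() != CodeGenOpt::None)
        addPass(createX86OptimizeLEAs());        // drop redundant address math
      addPass(createX86CallFrameOptimization()); // turn esp-relative movs into pushes
    }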
FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 852267400bba..8902a8534256 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -37,14 +37,26 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; -def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", - "Enable MMX instructions">; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all // SSE1+ processors support them. - [FeatureMMX, FeatureCMOV]>; + [FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", [FeatureSSE1]>; @@ -60,6 +72,11 @@ def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions", [FeatureMMX]>; @@ -79,16 +96,13 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; -// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that -// explicit. Also, it seems this would be the default state for most chips -// going forward, so it would probably be better to negate the logic and -// match the 32-byte "slow mem" feature below. -def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", - "IsUAMemFast", "true", - "Fast unaligned memory access">; +// FIXME: This should not apply to CPUs that do not have SSE. 
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", - "Slow unaligned 32-byte memory access">; + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -120,6 +134,8 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", "Enable AVX-512 Vector Length eXtensions", [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -168,9 +184,11 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; -def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", @@ -181,6 +199,11 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. +// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. 
def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; @@ -208,278 +231,473 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -def : Proc<"generic", []>; -def : Proc<"i386", []>; -def : Proc<"i486", []>; -def : Proc<"i586", []>; -def : Proc<"pentium", []>; -def : Proc<"pentium-mmx", [FeatureMMX]>; -def : Proc<"i686", []>; -def : Proc<"pentiumpro", [FeatureCMOV]>; -def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; -def : Proc<"pentium3", [FeatureSSE1]>; -def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSSE3, FeatureSlowBTMem]>; + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; // NetBurst. -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; // Intel Core 2 Solo/Duo. -def : ProcessorModel<"core2", SandyBridgeModel, - [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : ProcessorModel<"penryn", SandyBridgeModel, - [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; // Atom CPUs. 
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ - ProcIntelAtom, - FeatureSSSE3, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureSlowBTMem, - FeatureLeaForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions - ]>; + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ - ProcIntelSLM, - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowBTMem, - FeatureFastUAMem - ]>; + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; def : WestmereProc<"westmere">; // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
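Note the pattern in this hunk: the older cores (through Penryn, Bonnell, and the K8-era AMD parts further down) now carry FeatureSlowUAMem16, while the newer definitions simply omit it. The old positive fast-unaligned-mem feature has been inverted into a negative slow-unaligned-mem-16 feature, so unlisted and future CPUs default to fast unaligned access. A sketch of the kind of query this enables; the accessor name is inferred from the IsUAMem16Slow field above, and the real consumers live in X86Subtarget/X86ISelLowering and may differ in detail:

    #include "X86Subtarget.h"   // in-tree header, path assumed

    // Hypothetical helper: allow an unaligned 16-byte vector access unless the
    // selected CPU explicitly opted into slow-unaligned-mem-16.
    static bool canUseUnalignedVec16(const llvm::X86Subtarget &ST, unsigned Align) {
      return Align >= 16 || !ST.isUnalignedMem16Slow();
    }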
class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureADX, - FeatureRDSEED, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : BroadwellProc<"broadwell">; // FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureMPX]>; +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; def : KnightsLandingProc<"knl">; // FIXME: define SKX model -class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureCDI, - FeatureDQI, FeatureBWI, FeatureVLX, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - 
FeatureSlowIncDec, FeatureMPX]>; +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. // AMD CPUs. -def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [Feature3DNow]>; -def : Proc<"k6-3", [Feature3DNow]>; -def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + 
FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"barcelona", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + // Bobcat -def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD]>; +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, - [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD]>; - -// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Bulldozer -def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD]>; +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Piledriver -def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Steamroller -def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD, - FeatureFSGSBase]>; +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; // Excavator -def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, - FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, - 
FeaturePCLMUL, FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureSSE4A, - FeatureFSGSBase]>; - -def : Proc<"geode", [Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [Feature3DNow]>; -def : Proc<"c3", [Feature3DNow]>; -def : Proc<"c3-2", [FeatureSSE1]>; +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -492,8 +710,8 @@ def : Proc<"c3-2", [FeatureSSE1]>; // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; //===----------------------------------------------------------------------===// // Register File Description @@ -520,10 +738,6 @@ include "X86CallingConv.td" // Assembly Parser //===----------------------------------------------------------------------===// -def ATTAsmParser : AsmParser { - string AsmParserClassName = "AsmParser"; -} - def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; @@ -568,7 +782,6 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Information about the instructions... let InstructionSet = X86InstrInfo; - let AssemblyParsers = [ATTAsmParser]; let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; } diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index ba33248d2039..2170e62e30fd 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -217,10 +217,10 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? - MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : - ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); - Reg = getX86SubSuperRegister(Reg, VT); + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } O << X86ATTInstPrinter::getRegisterName(Reg); return; @@ -361,22 +361,21 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, switch (Mode) { default: return true; // Unknown mode. 
case 'b': // Print QImode register - Reg = getX86SubSuperRegister(Reg, MVT::i8); + Reg = getX86SubSuperRegister(Reg, 8); break; case 'h': // Print QImode high register - Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + Reg = getX86SubSuperRegister(Reg, 8, true); break; case 'w': // Print HImode register - Reg = getX86SubSuperRegister(Reg, MVT::i16); + Reg = getX86SubSuperRegister(Reg, 16); break; case 'k': // Print SImode register - Reg = getX86SubSuperRegister(Reg, MVT::i32); + Reg = getX86SubSuperRegister(Reg, 32); break; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. - MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32; - Reg = getX86SubSuperRegister(Reg, Ty); + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); break; } @@ -535,6 +534,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { S, MCConstantExpr::create(int64_t(1), MMI->getContext())); } } + OutStreamer->EmitSyntaxDirective(); } static void @@ -565,10 +565,11 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(Kind, C))) { + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { if (MCSymbol *Sym = S->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 7f5d127c68d5..9c8bd98dbade 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // outputting it to the OutStream. This allows the shadow tracker to minimise // the number of NOPs used for stackmap padding. 
void EmitAndCountInstruction(MCInst &Inst); - - void InsertStackMapShadows(MachineFunction &MF); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 031ba4ba9e66..fc6ee1752f1f 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -53,10 +54,13 @@ private: // Information we know about a particular call site struct CallContext { CallContext() - : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; + : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){} - // Actuall call instruction + // Iterator referring to the frame setup instruction + MachineBasicBlock::iterator FrameSetup; + + // Actual call instruction MachineInstr *Call; // A copy of the stack pointer @@ -75,17 +79,16 @@ private: bool UsePush; }; - typedef DenseMap<MachineInstr *, CallContext> ContextMap; + typedef SmallVector<CallContext, 8> ContextVector; bool isLegal(MachineFunction &MF); - bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap); void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); - bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, - const CallContext &Context); + bool adjustCallSequence(MachineFunction &MF, const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); @@ -100,7 +103,8 @@ private: const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; - const TargetFrameLowering *TFL; + const X86FrameLowering *TFL; + const X86Subtarget *STI; const MachineRegisterInfo *MRI; static char ID; }; @@ -124,8 +128,15 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // No point in running this in 64-bit mode, since some arguments are // passed in-register in all common calling conventions, so the pattern // we're looking for will never match. - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - if (STI.is64Bit()) + if (STI->is64Bit()) + return false; + + // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset + // in the compact unwind encoding that Darwin uses. So, bail if there + // is a danger of that being generated. + if (STI->isTargetDarwin() && + (!MF.getMMI().getLandingPads().empty() || + (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; // You would expect straight-line code between call-frame setup and @@ -161,7 +172,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // Check whether this trasnformation is profitable for a particular // function - in terms of code size. 
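To make the code-size heuristic in the hunk that follows concrete, consider one call site passing 16 bytes of stack arguments on a 16-byte-aligned stack, with pushes usable. The numbers below are taken from the visible accounting and are illustrative only:

    static int exampleAdvantage() {
      const int ExpectedDist = 16;  // bytes of outgoing stack arguments
      const int StackAlign   = 16;
      int Advantage = 0;
      Advantage -= 3;                        // ADD ESP needed after the call
      if (ExpectedDist % StackAlign)         // already aligned: no extra SUB charged
        Advantage -= 3;
      Advantage += (ExpectedDist / 4) * 3;   // four MOV->PUSH rewrites, ~3 bytes each
      return Advantage;                      // +9 => the call site is worth converting
    }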
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, - ContextMap &CallSeqMap) { + ContextVector &CallSeqVector) { // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. @@ -170,24 +181,20 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, return true; // Don't do this when not optimizing for size. - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); - - if (!OptForSize) + if (!MF.getFunction()->optForSize()) return false; unsigned StackAlign = TFL->getStackAlignment(); int64_t Advantage = 0; - for (auto CC : CallSeqMap) { + for (auto CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. - if (CC.second.NoStackParams) + if (CC.NoStackParams) continue; - if (!CC.second.UsePush) { + if (!CC.UsePush) { // If we don't use pushes for a particular call site, // we pay for not having a reserved call frame with an // additional sub/add esp pair. The cost is ~3 bytes per instruction, @@ -200,11 +207,11 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need and sub before - if (CC.second.ExpectedDist % StackAlign) + if (CC.ExpectedDist % StackAlign) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. - Advantage += (CC.second.ExpectedDist / 4) * 3; + Advantage += (CC.ExpectedDist / 4) * 3; } } @@ -212,8 +219,9 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, } bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getSubtarget().getInstrInfo(); - TFL = MF.getSubtarget().getFrameLowering(); + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); MRI = &MF.getRegInfo(); if (!isLegal(MF)) @@ -223,21 +231,22 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - ContextMap CallSeqMap; + ContextVector CallSeqVector; for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) if (I->getOpcode() == FrameSetupOpcode) { - CallContext &Context = CallSeqMap[I]; + CallContext Context; collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); } - if (!isProfitable(MF, CallSeqMap)) + if (!isProfitable(MF, CallSeqVector)) return false; - for (auto CC : CallSeqMap) - if (CC.second.UsePush) - Changed |= adjustCallSequence(MF, CC.first, CC.second); + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); return Changed; } @@ -307,13 +316,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // Check that this particular call sequence is amenable to the // transformation. 
const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - unsigned StackPtr = RegInfo.getStackRegister(); + STI->getRegisterInfo()); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. @@ -338,7 +347,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (!I->isCopy() || !I->getOperand(0).isReg()) return; Context.SPCopy = I++; - StackPtr = Context.SPCopy->getOperand(0).getReg(); + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of MOV32mi or MOV32mr @@ -434,22 +444,22 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, } bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, - MachineBasicBlock::iterator I, const CallContext &Context) { // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. - MachineBasicBlock::iterator FrameSetup = I; - MachineBasicBlock &MBB = *(I->getParent()); + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); FrameSetup->getOperand(1).setImm(Context.ExpectedDist); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = FrameSetup->getDebugLoc(); // Now, iterate through the vector in reverse order, and replace the movs // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; if (MOV->getOpcode() == X86::MOV32mi) { unsigned PushOpcode = X86::PUSHi32; // If the operand is a small (8-bit) immediate, we can use a @@ -461,21 +471,20 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = X86::PUSH32i8; } - BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); } else { unsigned int Reg = PushOp.getReg(); // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. - const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. 
MachineInstr *DefMov = nullptr; if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { - MachineInstr *Push = - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -483,12 +492,19 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DefMov->eraseFromParent(); } else { - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) .addReg(Reg) .getInstr(); } } + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + MBB.erase(MOV); } @@ -532,13 +548,10 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( DefMI->getParent() != FrameSetup->getParent()) return nullptr; - // Now, make sure everything else up until the ADJCALLSTACK is a sequence - // of MOVs. To be less conservative would require duplicating a lot of the - // logic from PeepholeOptimizer. - // FIXME: A possibly better approach would be to teach the PeepholeOptimizer - // to be smarter about folding into pushes. + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. for (auto I = DefMI; I != FrameSetup; ++I) - if (I->getOpcode() != X86::MOV32rm) + if (I->isLoadFoldBarrier()) return nullptr; return DefMI; diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index 0eb2494f1d63..a08160f9feba 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" @@ -42,6 +43,64 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of a double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return; otherwise go on to process the pending + // list. + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff.
+ if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + } // End llvm namespace #endif diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 8f88888f5ce3..54d88cbb244e 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -158,6 +158,7 @@ def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, @@ -202,6 +203,16 @@ def RetCC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -227,6 +238,9 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -280,7 +294,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType<v64i8>>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -305,7 +319,7 @@ def CC_X86_64_C : CallingConv<[ // Long doubles get stack slots whose size and alignment depends on the // subtarget. - CCIfType<[f80], CCAssignToStack<0, 0>>, + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, @@ -319,6 +333,23 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<64, 64>> ]>; +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. 
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + // Calling convention used on Win64 def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. @@ -561,6 +592,23 @@ def CC_X86_32_C : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + def CC_X86_32_FastCall : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -708,18 +756,28 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -734,6 +792,9 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -764,6 +825,12 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. 
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; @@ -778,6 +845,11 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15, RBP, (sequence "XMM%u", 0, 15))>; +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, (sequence "XMM%u", 16, 31))>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, @@ -804,3 +876,6 @@ def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm deleted file mode 100644 index 69b4c71651d7..000000000000 --- a/lib/Target/X86/X86CompilationCallback_Win64.asm +++ /dev/null @@ -1,68 +0,0 @@ -;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---=== -;; -;; The LLVM Compiler Infrastructure -;; -;; This file is distributed under the University of Illinois Open Source -;; License. See LICENSE.TXT for details. -;; -;;===----------------------------------------------------------------------=== -;; -;; This file implements the JIT interfaces for the X86 target. -;; -;;===----------------------------------------------------------------------=== - -extrn LLVMX86CompilationCallback2: PROC - -.code -X86CompilationCallback proc - push rbp - - ; Save RSP. - mov rbp, rsp - - ; Save all int arg registers - ; WARNING: We cannot use register spill area - we're generating stubs by hands! - push rcx - push rdx - push r8 - push r9 - - ; Align stack on 16-byte boundary. - and rsp, -16 - - ; Save all XMM arg registers. Also allocate reg spill area. - sub rsp, 96 - movaps [rsp +32], xmm0 - movaps [rsp+16+32], xmm1 - movaps [rsp+32+32], xmm2 - movaps [rsp+48+32], xmm3 - - ; JIT callee - - ; Pass prev frame and return address. - mov rcx, rbp - mov rdx, qword ptr [rbp+8] - call LLVMX86CompilationCallback2 - - ; Restore all XMM arg registers. - movaps xmm3, [rsp+48+32] - movaps xmm2, [rsp+32+32] - movaps xmm1, [rsp+16+32] - movaps xmm0, [rsp +32] - - ; Restore RSP. - mov rsp, rbp - - ; Restore all int arg registers - sub rsp, 32 - pop r9 - pop r8 - pop rdx - pop rcx - - ; Restore RBP. - pop rbp - ret -X86CompilationCallback endp - -End diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 6a5a28e546f2..a09d06519376 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,9 +19,10 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. 
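[Editor's note] The CalleeSavedRegs definitions above (CSR_64_TLS_Darwin, CSR_32_AllRegs, CSR_64_HHVM) build register lists by composing an existing set with extra registers. A rough C++ model of that composition follows; the register names and base list are illustrative and make no claim to match TableGen's exact ordering or deduplication rules.

// Sketch: model TableGen's (add Base, R1, R2, ...) list composition.
#include <initializer_list>
#include <string>
#include <vector>

using RegList = std::vector<std::string>;

RegList addRegs(RegList Base, std::initializer_list<std::string> Extra) {
  for (const auto &R : Extra)
    Base.push_back(R);             // append the extra registers to the base set
  return Base;
}

int main() {
  // x86-64 SysV callee-saved GPRs, standing in for CSR_64.
  RegList CSR64 = {"RBX", "RBP", "R12", "R13", "R14", "R15"};
  // Analogue of CSR_64_TLS_Darwin: the Darwin TLS helper preserves every GPR
  // except RDI (its one argument) and RAX (its return value).
  RegList CSR64TLSDarwin =
      addRegs(CSR64, {"RCX", "RDX", "RSI", "R8", "R9", "R10", "R11"});
  return CSR64TLSDarwin.size() > CSR64.size() ? 0 : 1;
}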
#include "llvm/IR/GlobalValue.h" using namespace llvm; @@ -141,6 +142,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index b4319c8bb04f..de94a138d865 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -298,8 +298,8 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. @@ -433,6 +433,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -449,35 +454,59 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; - case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; case MVT::f32: - Opc = X86ScalarSSEf32 ? - (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; break; case MVT::f64: - Opc = X86ScalarSSEf64 ? - (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; break; case MVT::v4f32: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? 
X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; - else + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } @@ -1069,12 +1098,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. We also do the same with %eax for Win32. - if (F.hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { + // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); @@ -1431,17 +1459,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addMBB(TrueMBB); } - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); - - // Emits an unconditional branch to the FalseBB, obtains the branch - // weight, and adds it to the successor list. - fastEmitBranch(FalseMBB, DbgLoc); - + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1472,12 +1490,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } @@ -1492,12 +1506,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1511,12 +1520,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1945,6 +1949,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned ResultReg; if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + // If we have AVX, create 1 blendv instead of 3 logic instructions. // Blendv was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many @@ -1955,10 +1962,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned BlendOpcode = (RetVT.SimpleTy == MVT::f32) ? 
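[Editor's note] For readers unfamiliar with sret: the comment above refers to the hidden struct-return pointer. A hand-written illustration of the convention is below; makeBig_lowered is not compiler output, just the conceptual ABI form that X86SelectRet is completing here.

// Sketch: what returning a struct by value via sret means at the source level.
struct Big { long a, b, c, d; };     // too big for a register return on x86

Big makeBig();                       // "returns" Big by value

// Conceptual ABI form: the caller allocates the result slot and passes a
// hidden pointer; the callee fills it in and hands the same pointer back in
// %rax (64-bit) or %eax (32-bit), which the change above now does for every
// x86 ABI rather than only 64-bit and Windows/MSVC targets.
Big *makeBig_lowered(Big *Sret) {
  *Sret = Big{10, 20, 30, 40};
  return Sret;
}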
X86::VBLENDVPSrr : X86::VBLENDVPDrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill, CmpReg, true); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2806,10 +2816,12 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; - if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) - return 0; - if (CS && CS->paramHasAttr(1, Attribute::InReg)) - return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + return 4; } @@ -2924,7 +2936,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -3020,8 +3032,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, - ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; @@ -3252,6 +3264,30 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { updateValueMap(I, Reg); return true; } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the fist operand. 
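[Editor's note] The computeBytesPoppedByCallee change a little further up tightens when fast-isel assumes the callee pops the 4-byte sret slot. A small sketch of the resulting decision, with illustrative parameter names:

// Sketch: bytes a 32-bit x86 callee pops on return because of an sret arg.
unsigned bytesPoppedByCallee(bool IsFastOrGHCOrHiPE, bool HasArgs,
                             bool FirstArgIsSRet, bool FirstArgInReg,
                             bool IsMCU) {
  // These conventions never leave sret cleanup to the callee.
  if (IsFastOrGHCOrHiPE)
    return 0;
  // Only a struct-return pointer actually passed on the stack (not inreg,
  // not on the MCU target) is popped by the callee.
  if (!HasArgs || !FirstArgIsSRet || FirstArgInReg || IsMCU)
    return 0;
  return 4;   // the 4-byte sret pointer slot
}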
+ updateValueMap(I, Reg); + return true; + } } return false; @@ -3384,8 +3420,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - DL.getPointerSize(), Align); + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 5eb4faeedff4..1dd69e8a6a5f 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -9,6 +9,7 @@ // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. // //===----------------------------------------------------------------------===// @@ -61,6 +62,11 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); @@ -89,6 +95,8 @@ public: private: MachineFunction *MF; const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; }; char FixupLEAPass::ID = 0; } @@ -150,7 +158,10 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); - if (!ST.LEAusesAG() && !ST.slowLEA()) + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) return false; TII = ST.getInstrInfo(); @@ -187,7 +198,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { - if (MFI->isPredecessor(MFI)) { + if (MFI->isPredecessor(&*MFI)) { I = --MFI->end(); return true; } else @@ -222,6 +233,60 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return nullptr; } +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + 
MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. @@ -265,8 +330,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr *MI = I; const int opcode = MI->getOpcode(); - if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && - opcode != X86::LEA64_32r) + if (!isLEA(opcode)) return; if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -280,7 +344,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { - default: llvm_unreachable("Unexpected LEA instruction"); + default: + llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -330,10 +395,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) - processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } } return false; } diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a863a3..97bb8ab653a6 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -120,12 +120,10 @@ namespace { // Return a bitmask of FP registers in block's live-in list. 
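[Editor's note] The fixupIncDec addition above only fires on a very specific LEA shape. A standalone sketch of that pattern match; the struct is a simplified stand-in for the LEA's five address operands, and note the pass additionally requires EFLAGS to be clobberable, since INC/DEC write flags while LEA does not.

// Sketch: recognize "lea %reg, 1(%reg)" / "lea %reg, -1(%reg)".
#include <cstdint>

struct LeaAddr {
  unsigned DstReg, BaseReg, IndexReg, SegReg;
  int64_t Disp;
  bool DispIsImm;
};

bool isSimpleIncOrDec(const LeaAddr &L, bool &IsInc) {
  // Same source and destination register, no index or segment register,
  // and an immediate displacement of exactly +1 or -1.
  if (L.DstReg != L.BaseReg || L.IndexReg != 0 || L.SegReg != 0 ||
      !L.DispIsImm || (L.Disp != 1 && L.Disp != -1))
    return false;
  IsInc = (L.Disp == 1);
  return true;
}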
static unsigned calcLiveInMask(MachineBasicBlock *MBB) { unsigned Mask = 0; - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (Reg < X86::FP0 || Reg > X86::FP6) + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) continue; - Mask |= 1 << (Reg - X86::FP0); + Mask |= 1 << (LI.PhysReg - X86::FP0); } return Mask; } @@ -301,8 +299,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } @@ -321,7 +320,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. SmallPtrSet<MachineBasicBlock*, 8> Processed; - MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock *Entry = &MF.front(); bool Changed = false; for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) @@ -329,9 +328,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process any unreachable blocks in arbitrary order now. if (MF.size() != Processed.size()) - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB).second) - Changed |= processBasicBlock(MF, *BB); + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); LiveBundles.clear(); @@ -348,13 +347,12 @@ void FPS::bundleCFG(MachineFunction &MF) { LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; - const unsigned Mask = calcLiveInMask(MBB); + for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); if (!Mask) continue; // Update MBB ingoing bundle mask. 
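[Editor's note] calcLiveInMask above folds a block's live-in list into a seven-bit mask over the x87 stack registers. The same computation as a self-contained sketch, with the register numbering passed in rather than taken from the X86 enum:

// Sketch: bitmask of live-in FP stack registers FP0..FP6.
#include <vector>

unsigned calcFPLiveInMask(const std::vector<unsigned> &LiveIns,
                          unsigned FP0, unsigned FP6) {
  unsigned Mask = 0;
  for (unsigned Reg : LiveIns) {
    if (Reg < FP0 || Reg > FP6)
      continue;                    // not an x87 stack register
    Mask |= 1u << (Reg - FP0);     // bit i set <=> FP-i is live into the block
  }
  return Mask;
}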
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; } } @@ -546,17 +544,9 @@ namespace { }; } -#ifndef NDEBUG -static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { - for (unsigned i = 0; i != NumEntries-1; ++i) - if (!(Table[i] < Table[i+1])) return false; - return true; -} -#endif - -static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); - if (I != Table+N && I->from == Opcode) +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) return I->to; return -1; } @@ -567,7 +557,7 @@ static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { #define ASSERT_SORTED(TABLE) \ { static bool TABLE##Checked = false; \ if (!TABLE##Checked) { \ - assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ "All lookup tables must be sorted for efficient access!"); \ TABLE##Checked = true; \ } \ @@ -746,7 +736,7 @@ static const TableEntry OpcodeTable[] = { static unsigned getConcreteOpcode(unsigned Opcode) { ASSERT_SORTED(OpcodeTable); - int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + int Opc = Lookup(OpcodeTable, Opcode); assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); return Opc; } @@ -797,7 +787,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { RegMap[Stack[--StackTop]] = ~0; // Update state // Check to see if there is a popping version of this instruction... - int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + int Opcode = Lookup(PopTable, I->getOpcode()); if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::UCOM_FPPr) @@ -1193,7 +1183,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // We decide which form to use based on what is on the top of the stack, and // which operand is killed by this instruction. - const TableEntry *InstTable; + ArrayRef<TableEntry> InstTable; bool isForward = TOS == Op0; bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); if (updateST0) { @@ -1208,8 +1198,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { InstTable = ReverseSTiTable; } - int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), - MI->getOpcode()); + int Opcode = Lookup(InstTable, MI->getOpcode()); assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); // NotTOS - The register which is not on the top of stack... @@ -1520,31 +1509,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { return; } - case X86::WIN_FTOL_32: - case X86::WIN_FTOL_64: { - // Push the operand into ST0. - MachineOperand &Op = MI->getOperand(0); - assert(Op.isUse() && Op.isReg() && - Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); - unsigned FPReg = getFPReg(Op); - if (Op.isKill()) - moveToTop(FPReg, Inst); - else - duplicateToTop(FPReg, ScratchFPReg, Inst); - - // Emit the call. This will pop the operand. 
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) - .addExternalSymbol("_ftol2") - .addReg(X86::ST0, RegState::ImplicitKill) - .addReg(X86::ECX, RegState::ImplicitDefine) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::EDX, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - --StackTop; - - break; - } - case X86::RETQ: case X86::RETL: case X86::RETIL: diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 3a21b57f0157..242d0333ef9a 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -18,25 +18,23 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include <cstdlib> using namespace llvm; -// FIXME: completely move here. -extern cl::opt<bool> ForceStackAlign; - X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, @@ -80,6 +78,27 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } +/// usesTheStack - This function checks if any of the users of EFLAGS +/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has +/// to use the stack, and if we don't adjust the stack we clobber the first +/// frame index. +/// See X86InstrInfo::copyPhysReg. +static bool usesTheStack(const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Conservativley assume that inline assembly might use the stack. + if (MF.hasInlineAsm()) + return true; + + return any_of(MRI.reg_instructions(X86::EFLAGS), + [](const MachineInstr &RI) { return RI.isCopy(); }); +} + +static bool doesStackUseImplyFP(const MachineFunction &MF) { + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + return IsWin64Prologue && usesTheStack(MF); +} + /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. @@ -92,8 +111,9 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn() || - MFI->hasStackMap() || MFI->hasPatchPoint()); + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + doesStackUseImplyFP(MF)); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -148,21 +168,14 @@ static unsigned getLEArOpcode(unsigned IsLP64) { /// to this register without worry about clobbering it. 
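[Editor's note] findDeadCallerSavedReg, whose doc comment ends just above, now draws its candidates from the tail-call GPR class instead of hard-coded lists. The core selection is simple enough to sketch standalone; the types here are simplified stand-ins for LLVM's register classes and operand scans.

// Sketch: pick a register the terminator does not read, so it is safe to
// clobber while adjusting the stack before the return.
#include <set>
#include <vector>

unsigned pickDeadCallerSavedReg(const std::vector<unsigned> &Candidates,
                                const std::set<unsigned> &UsedByTerminator,
                                unsigned RIP) {
  for (unsigned Reg : Candidates)
    if (!UsedByTerminator.count(Reg) && Reg != RIP)
      return Reg;                  // first candidate not used by the return
  return 0;                        // 0 plays the role of X86::NoRegister here
}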
static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetRegisterInfo *TRI, + const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; - static const uint16_t CallerSavedRegs32Bit[] = { - X86::EAX, X86::EDX, X86::ECX, 0 - }; - - static const uint16_t CallerSavedRegs64Bit[] = { - X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, - X86::R8, X86::R9, X86::R10, X86::R11, 0 - }; + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); unsigned Opc = MBBI->getOpcode(); switch (Opc) { @@ -191,10 +204,9 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, Uses.insert(*AI); } - const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; - for (; *CS; ++CS) - if (!Uses.count(*CS)) - return *CS; + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; } } @@ -214,8 +226,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -225,15 +241,27 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous another terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -289,6 +317,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); + else + MI->setFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; continue; } @@ -298,6 +328,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) MI.setMIFlag(MachineInstr::FrameSetup); + else + MI.setMIFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; } @@ -312,7 +344,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. 
+ // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert @@ -321,10 +357,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -347,30 +383,6 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. -static -void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - if (MBBI == MBB.begin()) return; - - MachineBasicBlock::iterator PI = std::prev(MBBI); - unsigned Opc = PI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || - Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += PI->getOperand(2).getImm(); - MBB.erase(PI); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= PI->getOperand(2).getImm(); - MBB.erase(PI); - } -} - int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { @@ -436,27 +448,265 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, } } -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. 
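[Editor's note] The BuildStackAdjustment changes above refine when an SP adjustment is emitted as LEA rather than ADD/SUB: ADD/SUB clobber EFLAGS, LEA does not. A condensed sketch of the decision, with plain booleans standing in for the subtarget and liveness queries:

// Sketch: pick LEA vs ADD/SUB for a stack-pointer adjustment.
bool useLEAForSPAdjust(bool InEpilogue, bool SubtargetPrefersLEA,
                       bool FlagsLiveIntoBlock, bool LEAAllowedInEpilogue,
                       bool FlagsNeededByTerminators) {
  if (!InEpilogue)
    // Prologue: use LEA if the subtarget likes it, or if EFLAGS is live into
    // the block and must not be clobbered.
    return SubtargetPrefersLEA || FlagsLiveIntoBlock;
  // Epilogue: only where LEA is legal at this insertion point, and prefer
  // the cheaper ADD/SUB unless the terminators still need the flags.
  if (!LEAAllowedInEpilogue)
    return false;
  return SubtargetPrefersLEA || FlagsNeededByTerminators;
}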
-static bool usesTheStack(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} - for (MachineRegisterInfo::reg_instr_iterator - ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); - ri != re; ++ri) - if (ri->isCopy()) - return true; +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; - return false; + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) const { +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. + // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. 
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); + ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); + ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + // Some useful constants + const int64_t ThreadEnvironmentStackLimit = 0x10; + const int64_t PageSize = 0x1000; + const int64_t PageMask = ~(PageSize - 1); + + // Registers we need. For the normal case we use virtual + // registers. For the prolog expansion we use RAX, RCX and RDX. + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RegClass = &X86::GR64RegClass; + const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + : MRI.createVirtualRegister(RegClass), + ZeroReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + CopyReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + TestReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + FinalReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + RoundedReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + LimitReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + JoinReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + ProbeReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass); + + // SP-relative offsets where we can save RCX and RDX. + int64_t RCXShadowSlot = 0; + int64_t RDXShadowSlot = 0; + + // If inlining in the prolog, save RCX and RDX. + // Future optimization: don't save or restore if not live in. + if (InProlog) { + // Compute the offsets. We need to account for things already + // pushed onto the stack at this point: return address, frame + // pointer (if used), and callee saves. + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); + const bool HasFP = hasFP(MF); + RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + RDXShadowSlot = RCXShadowSlot + 8; + // Emit the saves. + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); + } else { + // Not in the prolog. Copy RAX to a virtual reg. + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); + } + + // Add code to MBB to check for overflow and set the new target stack pointer + // to zero if so. + BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) + .addReg(ZeroReg, RegState::Undef) + .addReg(ZeroReg, RegState::Undef); + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); + BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) + .addReg(CopyReg) + .addReg(SizeReg); + BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + .addReg(TestReg) + .addReg(ZeroReg); + + // FinalReg now holds final stack pointer value, or zero if + // allocation would overflow. Compare against the current stack + // limit from the thread environment block. Note this limit is the + // lowest touched page on the stack, not the point at which the OS + // will cause an overflow exception, so this is just an optimization + // to avoid unnecessarily touching pages that are below the current + // SP but already commited to the stack by the OS. 
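[Editor's note] The inline CoreCLR stack probe being built above touches one byte per page between the current commit limit and the page the new stack pointer lands in. Its arithmetic, as a pure computation for illustration only (no memory is touched and the real TEB limit is not read); the function assumes the limit value is page-aligned, as the expansion does.

// Sketch: how many pages the inlined probe loop will touch.
#include <cstdint>

unsigned countProbePages(uint64_t CurrentSP, uint64_t AllocSize,
                         uint64_t StackLimit) {
  const uint64_t PageSize = 0x1000;
  const uint64_t PageMask = ~(PageSize - 1);

  // Saturate to zero on overflow, as the CMOVB in the expansion does.
  uint64_t FinalSP = AllocSize > CurrentSP ? 0 : CurrentSP - AllocSize;
  if (FinalSP >= StackLimit)
    return 0;                            // fast path: no new pages to commit

  uint64_t Rounded = FinalSP & PageMask;       // page containing the new SP
  uint64_t LimitPage = StackLimit & PageMask;  // lowest already-touched page
  // One store per page strictly below the current limit, down to and
  // including the page the new stack pointer falls into.
  return static_cast<unsigned>((LimitPage - Rounded) / PageSize);
}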
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit, RoundedReg page-rounded + // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. + MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -478,6 +728,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, Symbol = "_chkstk"; MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); // All current stack probes take AX and SP as input, clobber flags, and // preserve all registers. x86_64 probes leave RSP unmodified. 
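[Editor's note] Both probe emitters above use the same iterator idiom to mark everything they just inserted as frame setup: capture std::prev of the insertion point before emitting, then walk from there to the insertion point afterwards. A sketch of the idiom on a plain std::list standing in for a machine basic block:

// Sketch: tag the instructions a helper inserted between two iterators.
#include <iterator>
#include <list>
#include <string>

struct Instr { std::string Text; bool FrameSetup = false; };

void markInsertedAsFrameSetup(std::list<Instr> &Block,
                              std::list<Instr>::iterator BeforeInsertPt,
                              std::list<Instr>::iterator InsertPt) {
  // BeforeInsertPt was captured with std::prev(InsertPt) before the helper
  // ran, so everything in (BeforeInsertPt, InsertPt) is newly inserted.
  for (auto I = std::next(BeforeInsertPt); I != InsertPt; ++I)
    I->FrameSetup = true;
}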
@@ -507,6 +758,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(X86::RSP) .addReg(X86::RAX); } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -526,7 +797,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); - if (ForceStackAlign) { + if (MF.getFunction()->hasFnAttribute("stackrealign")) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) @@ -537,15 +808,14 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, + DebugLoc DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -646,6 +916,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -655,9 +932,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. @@ -723,6 +1002,24 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, uint64_t NumBytes = 0; int stackGrowth = -SlotSize; + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? 
X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; @@ -739,7 +1036,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. - MFI->setOffsetAdjustment(-NumBytes); + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) @@ -765,35 +1066,46 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (!IsWin64Prologue) { + if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - } - if (NeedsDwarfCFI) { - // Mark effective beginning of when frame pointer becomes valid. - // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } } - // Mark the FramePtr as live-in in every block. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(MachineFramePtr); + // Mark the FramePtr as live-in in every block. Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; @@ -818,9 +1130,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. 
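[Editor's note] BuildStackAlignAND, now parameterized on the register it aligns, realigns the stack by AND-ing with -MaxAlign. The arithmetic behind that, shown standalone; this is only the rounding step, not the surrounding prologue logic.

// Sketch: AND with -MaxAlign rounds the stack pointer down to a MaxAlign
// boundary, which is why the frame pointer must capture the old SP first.
#include <cassert>
#include <cstdint>

uint64_t realignDown(uint64_t SP, uint64_t MaxAlign) {
  assert(MaxAlign && (MaxAlign & (MaxAlign - 1)) == 0 && "power of two");
  // Same bits as SP & -MaxAlign under two's complement.
  return SP & ~(MaxAlign - 1);
}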
- if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -839,7 +1151,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; - if (IsWin64Prologue && TRI->needsStackRealignment(MF)) + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. @@ -876,26 +1188,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); } - // Save a pointer to the MI where we set AX. - MachineBasicBlock::iterator SetRAX = MBBI; - --SetRAX; - // Call __chkstk, __chkstk_ms, or __alloca. - emitStackProbeCall(MF, MBB, MBBI, DL); - - // Apply the frame setup flag to all inserted instrs. - for (; SetRAX != MBBI; ++SetRAX) - SetRAX->setFlag(MachineInstr::FrameSetup); + emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } @@ -909,19 +1213,72 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + if (IsWin64Prologue && HasFP) { - SEHFrameOffset = calculateSetFPREG(NumBytes); + // Set RBP to a small fixed offset from RSP. 
In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), - StackPtr, false, SEHFrameOffset); + SPOrEstablisher, false, SEHFrameOffset); else - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); - if (NeedsWinCFI) + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. Save ESP + // into the registration node so that the runtime will restore it for us. + if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { @@ -932,7 +1289,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { - int Offset = getFrameIndexOffset(MF, FI); + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) @@ -948,14 +1306,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + // If we need a base pointer, set it up here. 
It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer @@ -964,7 +1341,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens @@ -972,18 +1349,21 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } - if (X86FI->getHasSEHFramePtrSave()) { + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, - getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } @@ -1015,6 +1395,69 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue( return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. 
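// Illustrative sketch (hypothetical, not part of this patch) of the PSPSym
// protocol described above, written as plain C-style pseudo-operations.
// PSPSlotOffset is the single SP-relative offset shared by the root frame and
// every funclet frame; Establisher is the frame pointer the runtime passes in,
// which may belong to any ancestor funclet rather than to the root function.
void rootPrologue(char *SP, unsigned PSPSlotOffset) {
  // Root function: publish Initial-SP (SP immediately after the prologue) so
  // funclets and the GC can find the root frame.
  *reinterpret_cast<char **>(SP + PSPSlotOffset) = SP;
}

void funcletPrologue(char *SP, char *Establisher, unsigned PSPSlotOffset) {
  // Funclet: recover the root frame through whichever ancestor frame we were
  // given, then republish it at the same offset in our own (mostly empty)
  // frame so nested funclets and the GC can do the same.
  char *RootSP = *reinterpret_cast<char **>(Establisher + PSPSlotOffset);
  *reinterpret_cast<char **>(SP + PSPSlotOffset) = RootSP;
}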
+unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. + unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. + return FrameSizeMinusRBP - CSSize; +} + void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1027,12 +1470,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); @@ -1040,7 +1484,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - if (hasFP(MF)) { + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; @@ -1052,7 +1516,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } @@ -1063,26 +1528,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->isTerminator()) + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. + TargetMBB->setHasAddressTaken(); + } + if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) - mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was - // realigned. - if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); @@ -1134,9 +1623,24 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. 
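// Hypothetical sketch (not part of this patch) of the contract implemented
// here: a frame-index query yields a (register, offset) pair, and the register
// depends on the frame shape, so callers must not assume it is always the
// frame pointer. The selection mirrors the logic that follows.
#include <cstdint>

enum class FrameBase { FramePtr, StackPtr, BasePtr };
struct FrameIndexRef { FrameBase Reg; int64_t Offset; };

FrameIndexRef pickFrameReference(bool HasBasePtr, bool NeedsRealign,
                                 int64_t Offset) {
  if (HasBasePtr)     // dynamic allocas on top of dynamic realignment
    return {FrameBase::BasePtr, Offset};
  if (NeedsRealign)   // realigned stack: FP-relative offsets would be wrong
    return {FrameBase::StackPtr, Offset};
  return {FrameBase::FramePtr, Offset};
}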
+ if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the @@ -1207,48 +1711,62 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, return Offset + FPDelta; } -int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { - // We can't calculate offset from frame pointer if the stack is realigned, - // so enforce usage of stack/base pointer. The base pointer is used when we - // have dynamic allocas in addition to dynamic realignment. - if (TRI->hasBasePointer(MF)) - FrameReg = TRI->getBaseRegister(); - else if (TRI->needsStackRealignment(MF)) - FrameReg = TRI->getStackRegister(); - else - FrameReg = TRI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); -} - -// Simplified from getFrameIndexOffset keeping only StackPointer cases -int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG - // Note: LLVM arranges the stack as: - // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) - // > "Stack Slots" (<--SP) - // We can always address StackSlots from RSP. We can usually (unless - // needsStackRealignment) address CSRs from RSP, but sometimes need to - // address them from RBP. FixedObjects can be placed anywhere in the stack - // frame depending on their specific requirements (i.e. we can actually - // refer to arguments to the function which are stored in the *callers* - // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs - // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - // We don't handle tail calls, and shouldn't be seeing them - // either. + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. 
+ // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. int TailCallReturnAddrDelta = MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is @@ -1280,15 +1798,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F return Offset + StackSize; } -// Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - FrameReg = TRI->getStackRegister(); - return getFrameIndexOffsetFromSP(MF, FI); -} bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1358,6 +1867,11 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { @@ -1399,6 +1913,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. @@ -1420,7 +1950,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } return true; } @@ -1450,8 +1981,16 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, } // Spill the BasePtr if it's used. - if (TRI->hasBasePointer(MF)) + if (TRI->hasBasePointer(MF)) { SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. 
+ if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } } static bool @@ -1545,11 +2084,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. - for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; i++) { - allocMBB->addLiveIn(*i); - checkMBB->addLiveIn(*i); + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); } if (IsNested) @@ -1682,8 +2219,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1821,11 +2356,9 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), - E = PrologueMBB.livein_end(); - I != E; I++) { - stackCheckMBB->addLiveIn(*I); - incStackMBB->addLiveIn(*I); + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); @@ -1870,16 +2403,84 @@ void X86FrameLowering::adjustForHiPEPrologue( .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&PrologueMBB, 99); - stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&PrologueMBB, 99); - incStackMBB->addSuccessor(incStackMBB, 1); + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef XDEBUG MF.verify(); #endif } +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. 
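// Illustrative size model (hypothetical byte counts, not part of this patch)
// for the rewrite adjustStackWithPops performs at minsize: after a call,
//   addl $8, %esp   becomes   popl %ecx ; popl %edx
// using registers the call clobbers but does not define (a register the call
// defines, such as the one carrying its return value, is skipped by the def
// check that follows). POP r32/r64 encodes in one byte, while ADD rSP, imm8
// takes three or four.
unsigned bytesSavedByPops(unsigned NumPops, bool Is64Bit) {
  unsigned AddSize = Is64Bit ? 4 : 3; // add rsp/esp, imm8
  unsigned PopSize = 1;               // pop reg, no REX needed for low regs
  return AddSize - NumPops * PopSize; // positive for NumPops == 1 or 2
}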
+ if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -1895,8 +2496,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - if (Amount == 0) - return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1904,15 +2503,68 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + if (Amount) { // Add Amount to SP to destroy a frame, and subtract to setup. int Offset = isDestroy ? Amount : -Amount; - BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. 
EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? -CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } } + return; } @@ -1933,12 +2585,136 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. + if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. 
+ + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. + Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. 
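// Illustrative arithmetic (hypothetical values, not part of this patch) for
// the UnwindHelp placement above: the slot goes one SlotSize past the most
// negative existing fixed object, or past the -SlotSize position just after
// the return address when there are no fixed objects.
#include <algorithm>
#include <cstdint>

int64_t unwindHelpOffsetExample(int64_t MostNegativeFixedObj /*0 if none*/,
                                int64_t SlotSize = 8) {
  int64_t MinFixedObjOffset = std::min(MostNegativeFixedObj, -SlotSize);
  return MinFixedObjOffset - SlotSize; // min(-24, -8) - 8 == -32; none: -16
}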
+ MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); } diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 495cfcd1c3f7..3ab41b4a5789 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -47,11 +47,17 @@ public: unsigned StackPtr; - /// Emit a call to the target's stack probe function. This is required for all + /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize - /// the number of bytes to probe in RAX/EAX. - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL) const; + /// the number of bytes to probe in RAX/EAX. Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -91,11 +97,9 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; @@ -103,6 +107,11 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + /// Check the instruction before/after the passed instruction. If /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and @@ -125,7 +134,9 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; -private: + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. @@ -135,22 +146,56 @@ private: MachineBasicBlock::iterator I, uint64_t Amount) const; - uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; - /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, MCCFIInstruction CFIInst) const; + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. 
+ MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - uint64_t MaxAlign) const; + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; /// Adjusts the stack pointer using LEA, SUB, or ADD. MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int64_t Offset, bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index d5351d25d6ed..4414e478b99b 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -46,9 +46,8 @@ STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// namespace { - /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses - /// SDValue's instead of register numbers for the leaves of the matched - /// tree. + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, @@ -87,8 +86,7 @@ namespace { IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } - /// isRIPRelative - Return true if this addressing mode is already RIP - /// relative. + /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = @@ -147,21 +145,25 @@ namespace { namespace { //===--------------------------------------------------------------------===// - /// ISel - X86 specific code to select X86 machine instructions for + /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// OptForSize - If true, selector should try to optimize for code size - /// instead of performance. + /// If true, selector should try to optimize for code size instead of + /// performance. 
bool OptForSize; + /// If true, selector should try to optimize for minimum code size. + bool OptForMinSize; + public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false), + OptForMinSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -184,8 +186,7 @@ namespace { return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); } - // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // sign extended field. + // True if the 64-bit immediate fits in a 32-bit sign-extended field. inline bool i64immSExt32(SDNode *N) const { uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); return (int64_t)v == (int32_t)v; @@ -196,50 +197,50 @@ namespace { private: SDNode *Select(SDNode *N) override; - SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); - - bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); - bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); - bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); - bool MatchAddress(SDValue N, X86ISelAddressMode &AM); - bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); - bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); - bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectMOV64Imm32(SDValue N, SDValue &Imm); - bool SelectLEAAddr(SDValue N, SDValue &Base, + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectScalarSSELoad(SDNode *Root, SDValue N, + bool selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); - bool TryFoldLoad(SDNode *P, SDValue N, + bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. + /// Implement addressing mode selection for inline asm expressions. 
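// Worked check (hypothetical values, not part of this patch) for the
// i64immSExt32 predicate above: a 64-bit immediate qualifies exactly when
// sign-extending its low 32 bits reproduces the original value, which is the
// comparison the predicate performs.
#include <cassert>
#include <cstdint>

void i64immSExt32Examples() {
  auto Fits = [](uint64_t V) { return (int64_t)V == (int32_t)V; };
  assert(Fits(0x000000007FFFFFFFULL));  // INT32_MAX
  assert(Fits(0xFFFFFFFF80000000ULL));  // INT32_MIN, already sign-extended
  assert(!Fits(0x0000000080000000ULL)); // 2^31 needs a full 64-bit immediate
}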
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - void EmitSpecialCodeForMain(); + void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL, SDValue &Base, SDValue &Scale, @@ -252,7 +253,7 @@ namespace { : AM.Base_Reg; Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; - // These are 32-bit even in 64-bit mode since RIP relative offset + // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), @@ -283,32 +284,105 @@ namespace { Segment = CurDAG->getRegister(0, MVT::i32); } - /// getI8Imm - Return a target constant with the specified value, of type - /// i8. + // Utility function to determine whether we should avoid selecting + // immediate forms of instructions for better code size or not. + // At a high level, we'd like to avoid such instructions when + // we have similar constants used within the same basic block + // that can be kept in a register. + // + bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { + uint32_t UseCount = 0; + + // Do not want to hoist if we're not optimizing for size. + // TODO: We'd like to remove this restriction. + // See the comment in X86InstrInfo.td for more info. + if (!OptForSize) + return false; + + // Walk all the users of the immediate. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { + + SDNode *User = *UI; + + // This user is already selected. Count it as a legitimate use and + // move on. + if (User->isMachineOpcode()) { + UseCount++; + continue; + } + + // We want to count stores of immediates as real uses. + if (User->getOpcode() == ISD::STORE && + User->getOperand(1).getNode() == N) { + UseCount++; + continue; + } + + // We don't currently match users that have > 2 operands (except + // for stores, which are handled above) + // Those instruction won't match in ISEL, for now, and would + // be counted incorrectly. + // This may change in the future as we add additional instruction + // types. + if (User->getNumOperands() != 2) + continue; + + // Immediates that are used for offsets as part of stack + // manipulation should be left alone. These are typically + // used to indicate SP offsets for argument passing and + // will get pulled into stores/pushes (implicitly). + if (User->getOpcode() == X86ISD::ADD || + User->getOpcode() == ISD::ADD || + User->getOpcode() == X86ISD::SUB || + User->getOpcode() == ISD::SUB) { + + // Find the other operand of the add/sub. + SDValue OtherOp = User->getOperand(0); + if (OtherOp.getNode() == N) + OtherOp = User->getOperand(1); + + // Don't count if the other operand is SP. + RegisterSDNode *RegNode; + if (OtherOp->getOpcode() == ISD::CopyFromReg && + (RegNode = dyn_cast_or_null<RegisterSDNode>( + OtherOp->getOperand(1).getNode()))) + if ((RegNode->getReg() == X86::ESP) || + (RegNode->getReg() == X86::RSP)) + continue; + } + + // ... otherwise, count this and move on. + UseCount++; + } + + // If we have more than 1 use, then recommend for hoisting. + return (UseCount > 1); + } + + /// Return a target constant with the specified value of type i8. inline SDValue getI8Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } - /// getI32Imm - Return a target constant with the specified value, of type - /// i32. + /// Return a target constant with the specified value, of type i32. 
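// Rough size model (hypothetical byte counts, not part of this patch) for the
// UseCount > 1 threshold in shouldAvoidImmediateInstFormsForSize above,
// assuming full imm32 encodings and plain register operands: with two or more
// real uses, materializing the constant once is no larger than repeating it.
unsigned immediateFormBytes(unsigned Uses) {
  return Uses * 6;     // e.g. cmp r/m32, imm32 is opcode + modrm + imm32
}
unsigned hoistedFormBytes(unsigned Uses) {
  return 5 + Uses * 2; // mov r32, imm32 once, then reg-reg forms
}
// immediateFormBytes(2) == 12 versus hoistedFormBytes(2) == 9.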
inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } - /// getGlobalBaseReg - Return an SDNode that returns the value of - /// the global base register. Output instructions required to - /// initialize the global base register, if necessary. - /// + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. SDNode *getGlobalBaseReg(); - /// getTargetMachine - Return a reference to the TargetMachine, casted - /// to the target-specific type. + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. const X86TargetMachine &getTargetMachine() const { return static_cast<const X86TargetMachine &>(TM); } - /// getInstrInfo - Return a reference to the TargetInstrInfo, casted - /// to the target-specific type. + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } @@ -386,9 +460,9 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } -/// MoveBelowCallOrigChain - Replace the original chain operand of the call with +/// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. -static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector<SDValue, 8> Ops; SDValue Chain = OrigChain.getOperand(0); @@ -418,7 +492,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } -/// isCalleeLoad - Return true if call address is a load and it can be +/// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call @@ -461,12 +535,14 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + // OptFor[Min]Size are used in pattern predicates that isel is matching. + OptForSize = MF->getFunction()->optForSize(); + OptForMinSize = MF->getFunction()->optForMinSize(); + assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && // Only does this when target favors doesn't favor register indirect @@ -500,7 +576,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; - MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } @@ -577,9 +653,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } -/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in -/// the main function. 
-void X86DAGToDAGISel::EmitSpecialCodeForMain() { +/// Emit any code that needs to be executed only in the main function. +void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); @@ -599,7 +674,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(); + emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -612,7 +687,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) { return isInt<31>(Val); } -bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // Cannot combine ExternalSymbol displacements with integer offsets. if (Offset != 0 && (AM.ES || AM.MCSym)) @@ -634,7 +709,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, } -bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. @@ -658,11 +733,10 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ return true; } -/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes -/// into an addressing mode. These wrap things that will resolve down into a -/// symbol reference. If no match is possible, this returns true, otherwise it -/// returns false. -bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. if (AM.hasSymbolicDisplacement()) @@ -685,7 +759,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); - if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + if (foldOffsetIntoAddress(G->getOffset(), AM)) { AM = Backup; return true; } @@ -694,7 +768,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); AM.SymbolFlags = CP->getTargetFlags(); - if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { AM = Backup; return true; } @@ -710,7 +784,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); - if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { AM = Backup; return true; } @@ -758,11 +832,10 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { return true; } -/// MatchAddress - Add the specified node to the specified addressing mode, -/// returning true if it cannot be done. This just pattern matches for the -/// addressing mode. 
-bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { - if (MatchAddressRecursively(N, AM, 0)) +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has @@ -790,15 +863,49 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. -static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { - DAG.RepositionNode(Pos.getNode(), N.getNode()); + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos.getNode()->getNodeId()); } } @@ -807,7 +914,7 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. -static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -835,12 +942,12 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. 
- InsertDAGNode(DAG, N, Eight); - InsertDAGNode(DAG, N, Srl); - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, And); - InsertDAGNode(DAG, N, ShlCount); - InsertDAGNode(DAG, N, Shl); + insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); @@ -850,7 +957,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. -static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -880,9 +987,9 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, NewAnd); - InsertDAGNode(DAG, N, NewShift); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); AM.Scale = 1 << ShiftAmt; @@ -917,7 +1024,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. -static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -973,7 +1080,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); - InsertDAGNode(DAG, N, NewX); + insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); @@ -987,10 +1094,10 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewSRLAmt); - InsertDAGNode(DAG, N, NewSRL); - InsertDAGNode(DAG, N, NewSHLAmt); - InsertDAGNode(DAG, N, NewSHL); + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; @@ -998,7 +1105,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return false; } -bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); DEBUG({ @@ -1007,7 +1114,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, }); // Limit recursion. if (Depth > 5) - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. 
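// Editor's note (not part of the patch): foldMaskedShiftToScaledMask above depends on
// the bitwise identity (X << C1) & C2 == (X & (C2 >> C1)) << C1, which holds for any
// fixed-width unsigned X: both sides keep exactly the bits of X selected by C2 >> C1
// and then shift them left by C1, letting the shift be folded as the address scale.
// A minimal standalone check with editor-chosen constants C1 = 3, C2 = 0xF8:
constexpr unsigned shiftThenMask(unsigned X) { return (X << 3) & 0xF8u; }
constexpr unsigned maskThenShift(unsigned X) { return (X & (0xF8u >> 3)) << 3; }
static_assert(shiftThenMask(0x2Du) == maskThenShift(0x2Du) &&
              shiftThenMask(0xFFu) == maskThenShift(0xFFu),
              "mask and shift commute when the mask is pre-shifted");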
@@ -1020,7 +1127,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return true; if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) - if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } @@ -1038,19 +1145,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); - if (!FoldOffsetIntoAddress(Val, AM)) + if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: - if (!MatchWrapper(N, AM)) + if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: - if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) return false; break; @@ -1087,7 +1194,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; - if (!FoldOffsetIntoAddress(Disp, AM)) + if (!foldOffsetIntoAddress(Disp, AM)) return false; } @@ -1119,7 +1226,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Try to fold the mask and shift into the scale, and return false if we // succeed. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } @@ -1153,7 +1260,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); - if (FoldOffsetIntoAddress(Disp, AM)) + if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getNode()->getOperand(0); } else { Reg = N.getNode()->getOperand(0); @@ -1179,7 +1286,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; - if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1227,56 +1334,26 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - InsertDAGNode(*CurDAG, N, Zero); - InsertDAGNode(*CurDAG, N, Neg); + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); return false; } - case ISD::ADD: { - // Add an artificial use to this node so that we can keep track of - // it if it gets CSE'd with a different node. - HandleSDNode Handle(N); - - X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) - return false; - AM = Backup; - - // Try again after commuting the operands. - if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& - !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) return false; - AM = Backup; - - // If we couldn't fold both operands into the address at the same time, - // see if we can just put each operand into a register and fold at least - // the add. 
- if (AM.BaseType == X86ISelAddressMode::RegBase && - !AM.Base_Reg.getNode() && - !AM.IndexReg.getNode()) { - N = Handle.getValue(); - AM.Base_Reg = N.getOperand(0); - AM.IndexReg = N.getOperand(1); - AM.Scale = 1; - return false; - } - N = Handle.getValue(); break; - } case ISD::OR: - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - if (CurDAG->isBaseWithConstantOffset(N)) { - X86ISelAddressMode Backup = AM; - ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); - - // Start with the LHS as an addr mode. - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) - return false; - AM = Backup; - } + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; break; case ISD::AND: { @@ -1299,27 +1376,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. - if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; break; } } - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); } -/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the +/// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. -bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. 
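// Editor's note (not part of the patch): the new ISD::OR handling above treats 'or'
// as 'add' only when CurDAG->haveNoCommonBitsSet() proves the operands have disjoint
// set bits; with no bit in common there are no carries, so A | B and A + B are the
// same value and the whole expression can still be matched into an LEA, as in the
// and/shl example in the comment. The identity in miniature, with editor-chosen
// disjoint constants:
static_assert((0x01u | 0xF0u) == (0x01u + 0xF0u) &&
              (0x08u | 0x30u) == (0x08u + 0x30u),
              "with no common bits set, OR behaves exactly like ADD");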
@@ -1339,7 +1416,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1362,7 +1439,7 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, // If Base is 0, the whole address is in index and the Scale is 1 if (isa<ConstantSDNode>(Base)) { - assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() && + assert(cast<ConstantSDNode>(Base)->isNullValue() && "Unexpected base in gather/scatter"); Scale = getI8Imm(1, DL); Base = CurDAG->getRegister(0, MVT::i32); @@ -1375,14 +1452,14 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectAddr - returns true if it is able pattern match an addressing mode. +/// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. -bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; @@ -1404,7 +1481,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); } - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); @@ -1420,14 +1497,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to -/// match a load whose top elements are either undef or zeros. The load flavor -/// is derived from the type of N, which is either v4f32 or v2f64. +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. /// /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. -bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, +bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, @@ -1439,7 +1516,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; return true; } @@ -1457,7 +1534,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. 
LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; PatternNodeWithChain = SDValue(LD, 0); return true; @@ -1466,7 +1543,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } -bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); if ((uint32_t)ImmVal != (uint64_t)ImmVal) @@ -1495,10 +1572,10 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { return TM.getCodeModel() == CodeModel::Small; } -bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; SDLoc DL(N); @@ -1533,9 +1610,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, return true; } -/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing +/// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. -bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1546,7 +1623,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; @@ -1572,13 +1649,12 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA - // to a LEA. This is determined with some expermentation but is by no means + // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { - // For X86-64, we should always use lea to materialize RIP relative - // addresses. + // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else @@ -1596,8 +1672,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, return true; } -/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. -bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, +/// This is only run on TargetGlobalTLSAddress nodes. 
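// Editor's sketch (not part of the patch): selectMOV64Imm32 above accepts a 64-bit
// constant only when it survives a round trip through 32 bits, because a mov to a
// 32-bit register zero-extends into the full 64-bit register and is a shorter
// encoding than a 64-bit movabs. The same test written standalone (hypothetical
// helper):
constexpr bool isZExtImm32(unsigned long long V) {
  return V == static_cast<unsigned>(V);   // mirrors (uint32_t)ImmVal == (uint64_t)ImmVal
}
static_assert(isZExtImm32(0xFFFFFFFFULL) && !isZExtImm32(0x100000000ULL),
              "only zero-extended 32-bit immediates qualify");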
+bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); @@ -1621,7 +1697,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, } -bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1630,14 +1706,13 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, !IsLegalToFold(N, P, P, OptLevel)) return false; - return SelectAddr(N.getNode(), + return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } -/// getGlobalBaseReg - Return an SDNode that returns the value of -/// the global base register. Output instructions required to -/// initialize the global base register, if necessary. -/// +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); @@ -1828,7 +1903,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, return Val; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) return nullptr; @@ -1841,7 +1916,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. @@ -1933,9 +2008,9 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { return CurDAG->getMergeValues(RetVals, dl).getNode(); } -/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has -/// any uses which require the SF or OF bits to be accurate. -static bool HasNoSignedComparisonUses(SDNode *N) { +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { // Examine each user of the node. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { @@ -1995,9 +2070,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } -/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode -/// is suitable for doing the {load; increment or decrement; store} to modify -/// transformation. +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, SDValue StoredVal, SelectionDAG *CurDAG, LoadSDNode* &LoadNode, SDValue &InputChain) { @@ -2081,8 +2155,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, return true; } -/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory -/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. 
static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { if (Opc == X86ISD::DEC) { if (LdVT == MVT::i64) return X86::DEC64m; @@ -2099,9 +2173,8 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } -/// SelectGather - Customized ISel for GATHER operations. -/// -SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { // Operands of Gather: VSrc, Base, VIdx, VMask, Scale SDValue Chain = Node->getOperand(0); SDValue VSrc = Node->getOperand(2); @@ -2148,6 +2221,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -2190,7 +2284,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; } - SDNode *RetVal = SelectGather(Node, Opc); + SDNode *RetVal = selectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside SelectGather. return nullptr; @@ -2217,7 +2311,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; break; @@ -2404,10 +2498,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. if (!foldedLoad) { - foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } @@ -2549,7 +2643,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; @@ -2557,7 +2651,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). 
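// Editor's note (not part of the patch): the div8 special case described above exists
// because the 8-bit DIV/IDIV forms divide the 16-bit value in AX (AH:AL), so the
// dividend has to be widened first; loading it with a zero-extending MOVZX into EAX
// clears AH in the same instruction. The arithmetic fact relied on is simply that
// zero-extension preserves an unsigned 8-bit value:
static_assert(static_cast<unsigned short>(static_cast<unsigned char>(200)) == 200u &&
              (static_cast<unsigned short>(200) / 7) == (200 / 7),
              "a zero-extended 8-bit dividend divides the same");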
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; - if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, @@ -2692,7 +2786,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) + hasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to @@ -2709,7 +2803,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, convert "testl %eax, $8" to "testb %al, $8" if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && (!(C->getZExtValue() & 0x80) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); @@ -2743,7 +2837,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, "testl %eax, $2048" to "testb %ah, $8". if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); @@ -2781,7 +2875,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && N0.getValueType() != MVT::i16 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); SDValue Reg = N0.getNode()->getOperand(0); @@ -2804,7 +2898,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && N0.getValueType() == MVT::i64 && (!(C->getZExtValue() & 0x80000000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); SDValue Reg = N0.getNode()->getOperand(0); @@ -2854,7 +2948,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) break; @@ -2903,7 +2997,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: - if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } @@ -2916,9 +3010,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. 
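// Editor's note (not part of the patch): the TEST-narrowing cases above (for example
// rewriting "testl %eax, $8" as "testb %al, $8") are sound because the immediate only
// covers the low byte, so the masked value -- and with it ZF -- is unchanged; the
// extra hasNoSignedComparisonUses() guard is needed when the narrowed mask's sign bit
// is set, since SF would be derived from a different bit after narrowing. The
// ZF-preserving identity with an editor-chosen register value:
static_assert((0xDEADBEEFu & 0x8u) == ((0xDEADBEEFu & 0xFFu) & 0x8u),
              "an 8-bit TEST sees the same bits as the 32-bit TEST when the mask fits");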
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0f29b514146c..0927c2f4fa50 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -67,19 +68,14 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -// Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2); - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. - static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -118,13 +114,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - - // The _ftol2 runtime function has an unusual calling conv, which - // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -175,14 +164,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit - // FILD for other targets. + // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } @@ -206,23 +199,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } - // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 - // are Legal, f80 is custom lowered. - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. 
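// Editor's note (not part of the patch): promoting i1/i8 FP_TO_SINT, as the comment
// above says, performs the conversion at a width the hardware actually has (there is
// no cvttss2si form that writes an 8-bit register) and then truncates the result.
// Scalar model of that promotion (hypothetical helper):
constexpr signed char fptosi8ViaI32(float F) {
  return static_cast<signed char>(static_cast<int>(F));   // convert wide, then truncate
}
static_assert(fptosi8ViaI32(42.75f) == 42, "f32 -> i8 by way of i32");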
setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed @@ -232,8 +231,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) @@ -242,14 +247,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - } - if (isTargetFTOL()) { - // Use the _ftol2 runtime function, which has a pseudo-instruction - // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } @@ -274,8 +276,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. 
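// Editor's note (not part of the patch): MULHS/MULHU, expanded for the scalar types
// just below, stand for only the high half of a widening multiply; x86's one-operand
// MUL/IMUL produce both halves at once (EDX:EAX), which is why pairing a high-half
// request with the matching low-half multiply is left to DAG combine, as the comment
// above explains. The high-half semantics in scalar form (hypothetical helper):
constexpr unsigned mulhu32(unsigned A, unsigned B) {
  return static_cast<unsigned>((static_cast<unsigned long long>(A) * B) >> 32);
}
static_assert(mulhu32(0x80000000u, 4u) == 2u && mulhu32(3u, 5u) == 0u,
              "MULHU keeps only the upper 32 bits of the 64-bit product");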
- for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -295,6 +296,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -302,6 +304,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -312,7 +315,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. 
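// Editor's sketch (not part of the patch): the Promote action for f32 FREM set just
// below has the remainder computed in f64 and rounded back, mirroring the inline
// fmodf that the CRT comment above describes, so only the double-precision fmod
// symbol is ever referenced on 32-bit MSVC. A hypothetical standalone equivalent:
extern "C" double fmod(double, double);
static inline float fmodfViaF64(float X, float Y) {
  return static_cast<float>(fmod(static_cast<double>(X), static_cast<double>(Y)));
}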
+ setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); @@ -404,15 +417,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -456,8 +475,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); @@ -473,13 +491,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - if (Subtarget->is64Bit()) { - setExceptionPointerRegister(X86::RAX); - setExceptionSelectorRegister(X86::RDX); - } else { - setExceptionPointerRegister(X86::EAX); - setExceptionSelectorRegister(X86::EDX); - } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); @@ -492,8 +503,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { - // TargetInfo::X86_64ABIBuiltinVaList + if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { @@ -505,7 +515,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -613,8 +623,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. + // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -846,15 +864,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to custom lower non-power-of-2 vectors - if (!isPowerOf2_32(VT.getVectorNumElements())) - continue; - // Do not attempt to custom lower non-128-bit vectors - if (!VT.is128BitVector()) - continue; + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -892,13 +912,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
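// Editor's note (not part of the patch): the "promote to v2i64" scheme the comment
// above introduces works because AND/OR/XOR (and plain loads) are element-width
// agnostic; bitcasting a 128-bit vector between v16i8, v8i16, v4i32 and v2i64 never
// changes which bits get combined, so a single set of v2i64 patterns can serve every
// element width. The lane-width independence in miniature, packing two 8-bit lanes
// into 16 bits:
static_assert(((0x12u & 0xF0u) | ((0x34u & 0x0Fu) << 8)) ==
              ((0x12u | (0x34u << 8)) & (0xF0u | (0x0Fu << 8))),
              "bitwise ops do not care how the lanes are grouped");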
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) - continue; - + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); @@ -1036,6 +1050,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); @@ -1126,7 +1151,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1202,6 +1236,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1243,15 +1290,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-256-bit vectors - if (!VT.is256BitVector()) - continue; - + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); @@ -1293,6 +1333,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); @@ -1311,6 +1352,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1318,19 +1360,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); @@ -1348,12 +1381,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, 
MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); @@ -1386,7 +1469,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); @@ -1395,6 +1478,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1439,9 +1523,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - } + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + 
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); @@ -1455,7 +1579,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1481,15 +1605,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; - + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } @@ -1515,22 +1635,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1541,19 +1674,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - const MVT VT = (MVT::SimpleValueType)i; + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); - if (EltSize < 32) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - } + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } @@ -1571,6 +1716,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -1595,8 +1742,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1604,9 +1753,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; // Add/Sub/Mul with overflow operations are custom lowered. - MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); @@ -1615,7 +1765,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, VT, Custom); } - if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. 
setLibcallName(RTLIB::SHL_I128, nullptr); @@ -1658,12 +1807,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1671,24 +1824,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); - // On Darwin, -Os means optimize for size without hurting performance, - // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. - // Predictable cmov don't hurt on atom because it's in-order. + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. @@ -1716,40 +1869,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, if (!VT.isVector()) return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; - const unsigned NumElts = VT.getVectorNumElements(); - const EVT EltVT = VT.getVectorElementType(); - if (VT.is512BitVector()) { - if (Subtarget->hasAVX512()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - } - if (Subtarget->hasBWI()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 32: return MVT::v32i1; - case 64: return MVT::v64i1; - } - } + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } - if (VT.is256BitVector() || VT.is128BitVector()) { - if (Subtarget->hasVLX()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } - if (Subtarget->hasBWI() && Subtarget->hasVLX()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - case 32: return MVT::v32i1; - } + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } } return VT.changeVectorElementTypeToInteger(); @@ -1769,9 +1925,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign); + getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) @@ -1821,10 +1977,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1842,6 +1999,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. 
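  // In practice that means we still hand back a plain GPR-width integer type
  // below (i64 when the target is 64-bit and at least 8 bytes remain,
  // otherwise i32) and let the memcpy/memset expansion issue unaligned
  // scalar accesses.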
if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1860,8 +2020,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. return true; } @@ -1964,6 +2138,32 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } +Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + unsigned AddressSpace, Offset; + if (Subtarget->is64Bit()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x48; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x24 on i386 + Offset = 0x24; + AddressSpace = 256; + } + + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); @@ -1977,11 +2177,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, #include "X86GenCallingConv.inc" -bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { +bool X86TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); @@ -2001,6 +2199,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2025,7 +2226,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { - if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1) + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); @@ -2114,7 +2315,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return 
DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2193,7 +2397,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2244,28 +2448,28 @@ enum StructReturnType { StackStructReturn }; static StructReturnType -callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType -argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } @@ -2285,17 +2489,34 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// Return true if the calling convention is one that -/// supports tail call optimization. -static bool IsTailCallConvention(CallingConv::ID CC) { +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE); + CC == CallingConv::HiPE || CC == CallingConv::HHVM); } -/// \brief Return true if the calling convention is a C calling convention. -static bool IsCCallConvention(CallingConv::ID CC) { - return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || - CC == CallingConv::X86_64_SysV); +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. 
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { @@ -2306,19 +2527,12 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) + if (!mayTailCallThisCC(CalleeCC)) return false; return true; } -/// Return true if the function is being made into -/// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, - bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && IsTailCallConvention(CC); -} - SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, @@ -2329,7 +2543,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2344,6 +2558,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, else ValVT = VA.getValVT(); + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -2352,14 +2579,24 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); return ExtendedInMem ? 
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2413,15 +2650,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } -SDValue -X86TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) - const { +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); @@ -2436,9 +2668,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -2471,6 +2711,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -2547,8 +2789,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. - if (FuncIsMadeTailCallSafe(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -2561,13 +2803,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } - MachineModuleInfo &MMI = MF.getMMI(); - const Function *WinEHParent = nullptr; - if (MMI.hasWinEHFuncInfo(Fn)) - WinEHParent = MMI.getWinEHParent(Fn); - bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; - bool IsWinEHParent = WinEHParent && WinEHParent == Fn; - // Figure out if XMM registers are in use. 
assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && @@ -2631,10 +2866,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -2656,27 +2892,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } else if (IsWin64 && IsWinEHOutlined) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( - /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); - - MMI.getWinEHFuncInfo(Fn) - .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = - FuncInfo->getRegSaveFrameIndex(); - - // Store the second integer parameter (rdx) into rsp+16 relative to the - // stack pointer at the entry of the function. - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); - Chain = DAG.getStore( - Val.getValue(1), dl, Val, RSFIN, - MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), - /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { @@ -2723,12 +2938,15 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && + if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins) == StackStructReturn) + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2743,21 +2961,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); - if (IsWinEHParent) { - if (Is64Bit) { - int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); - SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); - MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; - SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); - Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, - MachinePointerInfo::getFixedStack(UnwindHelpFI), - /*isVolatile=*/true, - /*isNonTemporal=*/false, /*Alignment=*/0); - } else { - // Functions using Win32 EH are considered to have opaque SP adjustments - // to force local variables to be addressed from the frame or base - // pointers. 
- MFI->setHasOpaqueSPAdjustment(true); + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -2777,9 +2994,10 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } /// Emit a load of return address if tail call @@ -2813,11 +3031,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewReturnAddrFI), + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2835,11 +3066,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + if (Attr.getValueAsString() == "true") isTailCall = false; @@ -2878,7 +3112,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ++NumTailCalls; } - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. 
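// The getMOVL helper above builds the movss/movsd-style shuffle mask: lane 0
// is taken from the second input and every remaining lane from the first. A
// minimal standalone sketch of the index pattern it feeds to getVectorShuffle
// (plain C++, not LLVM code; the helper name here is illustrative only):
#include <cstdio>
#include <vector>

static std::vector<int> movlMask(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(NumElems);          // lane 0 <- element 0 of the second input
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);               // lane i <- element i of the first input
  return Mask;                       // indices >= NumElems select the second input
}

int main() {
  for (int Idx : movlMask(4))        // for a 4-element vector: prints 4 1 2 3
    std::printf("%d ", Idx);
  std::printf("\n");
  return 0;
}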
@@ -2892,13 +3126,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && - IsTailCallConvention(CallConv)) + canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2970,7 +3204,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && - Arg.getValueType().getScalarType() == MVT::i1) + Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. @@ -2987,9 +3221,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); Arg = SpillSlot; break; } @@ -3125,10 +3360,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Flags, DAG, dl)); } else { // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); } } @@ -3207,7 +3442,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -3261,9 +3497,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); - // If this is an invoke in a 32-bit function using an MSVC personality, assume - // the function clobbers all registers. If an exception is thrown, the runtime - // will not restore CSRs. + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { @@ -3272,7 +3508,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallerFn->hasPersonalityFn() ? 
classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; - if (isMSVCEHPersonality(Pers)) + if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } @@ -3300,7 +3536,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee @@ -3358,8 +3594,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // EDI // local1 .. -/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned -/// for a 16 byte align requirement. +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { @@ -3380,9 +3616,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, @@ -3435,25 +3670,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible -/// for tail call optimization. Targets which want to do tail call -/// optimization should implement this function. -bool -X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - Type *RetTy, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. 
- const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -3474,7 +3703,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) + if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } @@ -3493,19 +3722,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // An stdcall/thiscall caller is expected to clean up its arguments; the - // callee isn't going to do that. - // FIXME: this is more restrictive than needed. We could produce a tailcall - // when the stack adjustment matches. For example, with a thiscall that takes - // only one argument. - if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || - CallerCC == CallingConv::X86_ThisCall)) - return false; - // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { - // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) @@ -3573,6 +3792,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + unsigned StackArgsSize = 0; + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -3587,11 +3808,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); - if (CCInfo.getNextStackOffset()) { - MachineFunction &MF = DAG.getMachineFunction(); - if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) - return false; + StackArgsSize = CCInfo.getNextStackOffset(); + if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -3642,6 +3861,21 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
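    // A 'ret imm' in the callee would then release stack bytes that our own
    // caller never allocated for this call, leaving the stack pointer
    // misadjusted after the tail call returns.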
+ return false; + } + return true; } @@ -3688,11 +3922,13 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: return true; } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3707,7 +3943,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3772,23 +4008,23 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, return false; } -/// isCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool TailCallOpt) { + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: return !is64Bit; - case CallingConv::Fast: - case CallingConv::GHC: - case CallingConv::HiPE: - if (IsVarArg) - return false; - return TailCallOpt; } } @@ -3807,11 +4043,26 @@ static bool isX86CCUnsigned(unsigned X86CC) { case X86::COND_BE: return true; case X86::COND_AE: return true; } - llvm_unreachable("covered switch fell through?!"); } -/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code, returning the condition code and the LHS/RHS of the +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. 
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { @@ -3833,19 +4084,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } - switch (SetCCOpcode) { - default: llvm_unreachable("Invalid integer condition!"); - case ISD::SETEQ: return X86::COND_E; - case ISD::SETGT: return X86::COND_G; - case ISD::SETGE: return X86::COND_GE; - case ISD::SETLT: return X86::COND_L; - case ISD::SETLE: return X86::COND_LE; - case ISD::SETNE: return X86::COND_NE; - case ISD::SETULT: return X86::COND_B; - case ISD::SETUGT: return X86::COND_A; - case ISD::SETULE: return X86::COND_BE; - case ISD::SETUGE: return X86::COND_AE; - } + return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. @@ -3898,8 +4137,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } -/// hasFPCMov - is there a floating point cmov for the specific X86 condition -/// code. Current x86 isa includes the following FP cmov instructions: +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { @@ -3917,7 +4156,7 @@ static bool hasFPCMov(unsigned X86CC) { } } -/// isFPImmLegal - Returns true if the target can instruction select the +/// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -3970,7 +4209,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasLZCNT(); } -/// isUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) @@ -3979,19 +4218,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } -/// isUndefOrInRange - Return true if Val is undef or if its value falls within -/// the specified range (L, H]. +/// Return true if Val is undef or if its value falls within the +/// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrEqual - Val is either less than zero (undef) or equal to the -/// specified value. +/// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } -/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. 
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, @@ -4002,9 +4240,8 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isVEXTRACTIndex - Return true if the specified -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for instruction that extract 128 or 256 bit vectors +/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector +/// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4021,7 +4258,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return Result; } -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR +/// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { @@ -4057,8 +4294,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) { static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACT"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4072,8 +4309,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERT"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4085,53 +4322,71 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { return Index / NumElemsPerChunk; } -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } -/// getInsertVINSERT128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. 
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } -/// getInsertVINSERT256Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// isZero - Returns true if Elt is a constant integer zero -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - -/// isZeroNode - Returns true if Elt is a constant zero or a floating point -/// constant +0.0. +/// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (isZero(Elt)) - return true; - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) - return CFP->getValueAPF().isPosZero(); - return false; + return isNullConstant(Elt) || isNullFPConstant(Elt); } -/// getZeroVector - Returns a vector of specified type with all zero elements. -/// -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. +static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4163,7 +4418,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); - } else if (VT.getScalarType() == MVT::i1) { + } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -4195,19 +4450,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); + // we want. 
Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } @@ -4245,13 +4499,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -4279,7 +4533,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); @@ -4316,6 +4570,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +/// Insert i1-subvector to i1-vector. +static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. 
Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower @@ -4334,18 +4663,22 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -/// getOnesVector - Returns a vector of specified type with all bits set. +/// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. 
-static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - SDLoc dl) { +static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; - if (VT.is256BitVector()) { - if (HasInt256) { // AVX2 + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX @@ -4360,19 +4693,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getBitcast(VT, Vec); } -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); -} - -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +/// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4384,7 +4705,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. +/// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4396,10 +4717,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified -/// vector of zero or undef vector. This produces a shuffle where the low -/// element of V2 is swizzled into the zero/undef vector, landing at element -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, @@ -4415,10 +4736,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. Sets -/// IsUnary to true if only uses one source. Note that this will set IsUnary for -/// shuffles which use a single input multiple times, and in those cases it will +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. Sets IsUnary to true if only +/// uses one source. 
Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. /// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. static bool getTargetShuffleMask(SDNode *N, MVT VT, @@ -4482,7 +4803,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. - EVT VT = MaskNode.getValueType(); + MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) @@ -4572,6 +4893,119 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } default: llvm_unreachable("unknown target shuffle node"); } @@ -4586,7 +5020,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return true; } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { @@ -4650,8 +5084,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. -/// +/// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4721,8 +5154,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return DAG.getBitcast(MVT::v16i8, V); } -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. -/// +/// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4753,7 +5185,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +/// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -4924,7 +5356,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~(RequiredAlign-1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -5157,8 +5589,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. 
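The VPERMV/VPERMV3 decoding added above recovers a shuffle mask from a constant index vector by keeping only the low log2(N) bits of each index (log2(2N) for the two-source form). A minimal standalone sketch of that truncation, ignoring undef sentinels and using hypothetical names:

#include <cstdint>
#include <cstdio>
#include <vector>

// Truncate each raw VPERM index to the bits that actually select an element:
// log2(N) bits for the one-source form, log2(2*N) bits for the two-source
// form, mirroring the getLoBits() use in the decoding above.
static std::vector<int> decodePermIndices(const std::vector<uint64_t> &Raw,
                                          unsigned NumElts, bool TwoSources) {
  unsigned Bits = 0;
  for (unsigned N = TwoSources ? 2 * NumElts : NumElts; N > 1; N >>= 1)
    ++Bits; // Log2 of the selectable element count.
  uint64_t LoMask = (Bits >= 64) ? ~0ULL : ((1ULL << Bits) - 1);
  std::vector<int> Mask;
  for (uint64_t R : Raw)
    Mask.push_back(int(R & LoMask));
  return Mask;
}

int main() {
  // v8i32 VPERMV: only the low 3 bits of each i32 index are meaningful.
  std::vector<uint64_t> Raw = {7, 6, 13, 4, 3, 10, 1, 0};
  for (int M : decodePermIndices(Raw, 8, /*TwoSources=*/false))
    printf("%d ", M); // prints: 7 6 5 4 3 2 1 0
  printf("\n");
  return 0;
}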
- const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -5188,9 +5619,10 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5329,7 +5761,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); @@ -5366,7 +5798,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); @@ -5600,7 +6032,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); @@ -5662,12 +6094,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); - if (InVec0.getValueType() != VT) + if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); - if (InVec1.getValueType() != VT) + if (InVec1.getSimpleValueType() != VT) return SDValue(); } @@ -5703,7 +6135,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; @@ -5845,7 +6277,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. 
- if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5866,7 +6298,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Op; if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); @@ -5881,7 +6313,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumZero = 0; unsigned NumNonZero = 0; - unsigned NonZeros = 0; + uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet<SDValue, 8> Values; for (unsigned i = 0; i < NumElems; ++i) { @@ -5895,7 +6327,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (X86::isZeroNode(Elt)) NumZero++; else { - NonZeros |= (1 << i); + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } @@ -5919,7 +6352,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); - EVT VecVT = MVT::v4i32; + MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). @@ -6051,7 +6484,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, - Op.getOperand(Idx)); + Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); @@ -6059,13 +6492,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) - if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) - if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS @@ -6077,7 +6510,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -6177,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
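Widening NonZeros to uint64_t in the LowerBUILD_VECTOR hunk above matters once a build_vector can have up to 64 operands (e.g. v64i8): with a 32-bit accumulator, 1 << i is undefined for i >= 32. A small standalone illustration of the bookkeeping, with hypothetical names:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Record which of up to 64 build_vector operands are non-zero in a bitmask,
// mirroring the NonZeros bookkeeping above.
static uint64_t collectNonZeros(const int *Elts, unsigned NumElems) {
  assert(NumElems <= 64 && "Bitmask only tracks up to 64 elements");
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumElems; ++i)
    if (Elts[i] != 0)
      NonZeros |= (uint64_t)1 << i; // Widen before shifting past bit 31.
  return NonZeros;
}

int main() {
  int Elts[64] = {};
  Elts[0] = 1;
  Elts[40] = 7; // Would be lost (or UB) with a 32-bit accumulator.
  uint64_t NZ = collectNonZeros(Elts, 64);
  printf("count=%d bit40=%d\n", __builtin_popcountll(NZ),
         (int)((NZ >> 40) & 1)); // count=2 bit40=1
  return 0;
}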
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -6193,8 +6626,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), @@ -6213,8 +6646,27 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + // Specialize the cases when all, or all but one, of the operands are undef. + unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector<SDValue, 2> Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) @@ -6227,31 +6679,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } + // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); - + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) - return getZeroVector(ResVT, Subtarget, DAG, dl); + return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResVT); - unsigned NumElems = ResVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); - V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); if (IsZeroV1) - return V2; + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - // Zero the upper bits of V1 - V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); - V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); - if (IsZeroV2) - return V1; - return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); + return 
DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -6272,7 +6731,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, return LowerAVXCONCAT_VECTORS(Op, DAG); } - //===----------------------------------------------------------------------===// // Vector shuffle lowering // @@ -6422,6 +6880,127 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. 
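As a rough standalone model of what the helper below checks (plain int lanes instead of SDValues; names are illustrative): a shuffle is a bitmask blend when every lane that is not zeroable reads element i of one and the same source, so a single AND with a constant mask produces the result.

#include <cstdio>
#include <vector>

// Returns true and fills Out when the shuffle can be done as
// Out[i] = Src[i] & Keep[i], i.e. every non-zeroable lane reads element i of
// a single source; zeroable (or undef) lanes are forced to zero by the mask.
static bool blendAsBitMask(const std::vector<int> &V1,
                           const std::vector<int> &V2,
                           const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable,
                           std::vector<int> &Out) {
  int Size = (int)Mask.size();
  const std::vector<int> *Src = nullptr;
  std::vector<int> Keep(Size, 0); // 0 = force lane to zero, ~0 = keep lane.
  for (int i = 0; i < Size; ++i) {
    if (Zeroable[i] || Mask[i] < 0)
      continue;
    if (Mask[i] % Size != i)
      return false; // Element changes lanes: not a bitmask blend.
    const std::vector<int> *This = Mask[i] < Size ? &V1 : &V2;
    if (Src && Src != This)
      return false; // Needs lanes from both inputs: a single AND can't.
    Src = This;
    Keep[i] = ~0;
  }
  if (!Src)
    return false; // Everything zeroable; a plain zero vector is better.
  Out.resize(Size);
  for (int i = 0; i < Size; ++i)
    Out[i] = (*Src)[i] & Keep[i];
  return true;
}

int main() {
  // Mask <0, u, 2, u> with lanes 1 and 3 zeroable: V1 & <~0, 0, ~0, 0>.
  std::vector<int> V1 = {10, 20, 30, 40}, V2 = {50, 60, 70, 80}, Out;
  if (blendAsBitMask(V1, V2, {0, -1, 2, -1}, {false, true, false, true}, Out))
    printf("%d %d %d %d\n", Out[0], Out[1], Out[2], Out[3]); // 10 0 30 0
  return 0;
}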
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -6431,7 +7010,7 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); - MVT EltVT = VT.getScalarType(); + MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, @@ -6458,22 +7037,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is in fact a blend. +/// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, + SDValue V2, ArrayRef<int> Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] >= Size) { - if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input! + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { BlendMask |= 1u << i; continue; } - if (Mask[i] >= 0 && Mask[i] != i) - return SDValue(); // Shuffled V1 input! 
+ if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -6493,12 +7112,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -6511,12 +7125,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -6541,9 +7150,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { - assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -6760,11 +7373,11 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } - assert(VT.getSizeInBits() == 128 && + assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); @@ -6785,92 +7398,6 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } -/// \brief Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. 
Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; - continue; - } - - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. - SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) - continue; - - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; - } - - return Zeroable; -} - -/// \brief Try to emit a bitmask instruction for a shuffle. -/// -/// This handles cases where we can model a blend exactly as a bitmask due to -/// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - MVT EltVT = VT.getScalarType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getBitcast(EltVT, Zero); - AllOnes = DAG.getBitcast(EltVT, AllOnes); - } - SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SDValue V; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Zeroable[i]) - continue; - if (Mask[i] % Size != i) - return SDValue(); // Not a blend. - if (!V) - V = Mask[i] < Size ? V1 : V2; - else if (V != (Mask[i] < Size ? V1 : V2)) - return SDValue(); // Can only let one input through the mask. - - VMaskOps[i] = AllOnes; - } - if (!V) - return SDValue(); // No non-zeroable elements! - - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; -} - /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -6982,7 +7509,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; - for (; Len >= 0; --Len) + for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); @@ -6997,8 +7524,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue &V = (M < Size ? V1 : V2); M = M % Size; - // All mask elements must be in the lower half. 
- if (M > HalfSize) + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) return SDValue(); if (Idx < 0 || (Src == V && Idx == (M - i))) { @@ -7095,64 +7623,104 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available -/// features of the subtarget. +/// features of the subtarget. The extended elements are consecutive and +/// can start from an offset element index in the input; to +/// avoid excess shuffling the offset must either be in the bottom lane +/// or at the start of a higher lane. All extended elements must be from +/// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( - SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, + SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); - int NumElements = VT.getVectorNumElements(); int EltBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = 128 / EltBits; + int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + assert(0 <= Offset && "Extension offset must be positive."); + assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && + "Extension offset must be in the first lane or start an upper lane."); + + // Check that an index is in the same lane as the base offset. + auto SafeOffset = [&](int Idx) { + return OffsetLane == (Idx / NumEltsPerLane); + }; + + // Shift along an input so that the offset base moves to the first element. + auto ShuffleOffset = [&](SDValue V) { + if (!Offset) + return V; + + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = 0; i * Scale < NumElements; ++i) { + int SrcIdx = i + Offset; + ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; + } + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); + }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { + // Not worth offsetting 128-bit vectors if scale == 2, a pattern using + // PUNPCK will catch this in a later shuffle match. + if (Offset && Scale == 2 && VT.is128BitVector()) + return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); + InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); + return DAG.getBitcast(VT, InputV); } + assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); + // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { - int PSHUFDMask[4] = {0, -1, 1, -1}; + int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ?
Offset + 1 : -1, + -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { - int PSHUFDMask[4] = {0, -1, 0, -1}; + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); - int PSHUFHWMask[4] = {1, -1, -1, -1}; + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); - assert(VT.getSizeInBits() == 128 && "Unexpected vector width!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(0, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements/2, NumElements/2)) + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); - SDValue Hi = - DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(EltBits, DL, MVT::i8))); + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -7163,9 +7731,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; - for (int i = 0; i < 16; ++i) - PSHUFBMask[i] = - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8); + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, @@ -7173,13 +7743,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT::v16i8, PSHUFBMask))); } + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. + int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + // Otherwise emit a sequence of unpacks. 
do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); - InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; @@ -7205,7 +7792,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); @@ -7215,8 +7804,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; + int Offset = 0; + int Matches = 0; for (int i = 0; i < NumElements; ++i) { - if (Mask[i] == -1) + int M = Mask[i]; + if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. @@ -7230,14 +7822,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // Each of the base elements needs to be consecutive indices into the // same input vector. - SDValue V = Mask[i] < NumElements ? V1 : V2; - if (!InputV) + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { InputV = V; - else if (InputV != V) + Offset = M - (i / Scale); + } else if (InputV != V) return SDValue(); // Flip-flopping inputs. - if (Mask[i] % NumElements != i / Scale) + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. + if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. + Matches++; } // If we fail to find an input, we have a zero-shuffle which should always @@ -7246,8 +7853,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (!InputV) return SDValue(); + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7355,8 +7967,9 @@ static SDValue lowerVectorShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. 
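The matching loop above now also accepts an offset base, i.e. masks of the form <Offset+0, z, Offset+1, z, ...> where the z lanes are zeroable. A standalone, single-source model of just that pattern match (hypothetical names; the real code additionally restricts the offset to the low 128-bit lane or the start of an upper lane and handles two inputs):

#include <cstdio>
#include <vector>

// Return true if Mask is a Scale-x zero/any-extend of consecutive elements
// starting at *OffsetOut, i.e. Mask[i] == Offset + i/Scale on the anchor
// lanes (i % Scale == 0) and every other lane is zeroable or undef.
static bool matchZExtWithOffset(const std::vector<int> &Mask,
                                const std::vector<bool> &Zeroable, int Scale,
                                int *OffsetOut) {
  int NumElts = (int)Mask.size();
  int Offset = 0;
  bool HaveAnchor = false;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef lane: matches anything.
    if (i % Scale != 0) {
      // Lanes between anchors must be fillable with zeros (or left undefined
      // for an any-extend).
      if (!Zeroable[i])
        return false;
      continue;
    }
    // Anchor lane: must read the next consecutive source element.
    if (!HaveAnchor) {
      Offset = M - i / Scale;
      HaveAnchor = true;
    } else if (M != Offset + i / Scale) {
      return false; // Non-consecutive strided elements.
    }
  }
  if (!HaveAnchor || Offset < 0)
    return false;
  *OffsetOut = Offset;
  return true;
}

int main() {
  // <2, u, 3, u> is a 2x extend of elements starting at offset 2.
  std::vector<int> Mask = {2, -1, 3, -1};
  std::vector<bool> Zeroable = {false, true, false, true};
  int Offset;
  if (matchZExtWithOffset(Mask, Zeroable, /*Scale=*/2, &Offset))
    printf("offset = %d\n", Offset); // offset = 2
  return 0;
}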
- if (SDValue V2S = getScalarValueForVectorElement( - V2, Mask[V2Index] - Mask.size(), DAG)) { + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { @@ -7431,11 +8044,65 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, @@ -7476,7 +8143,7 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = - BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; @@ -7491,6 +8158,15 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. 
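The lowerVectorShuffleAsTruncBroadcast helper added above maps a broadcast of narrow element BroadcastIdx, seen through a bitcast from wider elements, onto the wider scalar: pick operand BroadcastIdx / Scale, shift right by (BroadcastIdx % Scale) * EltSize, then truncate. A little-endian scalar model of that index arithmetic (illustrative names only):

#include <cstdint>
#include <cstdio>

// Extract narrow element BroadcastIdx from a vector of wide integers
// reinterpreted as EltBits-sized elements (little endian), the way the
// trunc-broadcast lowering peels it out of the wide scalar operand.
static uint64_t extractNarrowElt(const uint64_t *WideElts, unsigned WideBits,
                                 unsigned EltBits, unsigned BroadcastIdx) {
  unsigned Scale = WideBits / EltBits;            // Narrow elts per wide elt.
  uint64_t Wide = WideElts[BroadcastIdx / Scale]; // Which wide operand.
  unsigned Shift = (BroadcastIdx % Scale) * EltBits; // The SRL amount.
  uint64_t Mask = EltBits == 64 ? ~0ULL : ((1ULL << EltBits) - 1);
  return (Wide >> Shift) & Mask; // The truncate.
}

int main() {
  // A v2i64 value viewed as v8i16: broadcast element 5.
  uint64_t WideElts[2] = {0x0011223344556677ULL, 0x8899aabbccddeeffULL};
  printf("0x%04llx\n", (unsigned long long)extractNarrowElt(
                           WideElts, 64, 16, 5)); // prints 0xccdd
  return 0;
}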
+ // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -7499,6 +8175,20 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(V); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. @@ -7595,9 +8285,10 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && @@ -7774,10 +8465,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, @@ -7869,10 +8559,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8077,14 +8766,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); @@ -8161,14 +8845,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8184,8 +8863,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, + V2, Mask, DAG)) return Unpack; // We implement this with SHUFPS because it can blend from two vectors. 
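The new lowerVectorShuffleWithUNPCK helper replaces hard-coded isShuffleEquivalent patterns like the ones removed here by computing the per-128-bit-lane interleave masks directly. A standalone sketch of that index formula (hypothetical names):

#include <cstdio>
#include <vector>

// Build the UNPCKL/UNPCKH reference masks for NumElts elements of EltBits
// each: within every 128-bit lane, unpack-lo interleaves the low halves of
// the two sources and unpack-hi interleaves the high halves.
static void buildUnpackMasks(int NumElts, int EltBits, std::vector<int> &Lo,
                             std::vector<int> &Hi) {
  int EltsPerLane = 128 / EltBits;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / EltsPerLane) * EltsPerLane;
    // Even result slots read V1, odd slots read V2 (offset by NumElts).
    int LoPos = (i % EltsPerLane) / 2 + LaneStart + NumElts * (i % 2);
    Lo.push_back(LoPos);
    Hi.push_back(LoPos + EltsPerLane / 2);
  }
}

int main() {
  // v4i32: expect UNPCKL = <0, 4, 1, 5> and UNPCKH = <2, 6, 3, 7>.
  std::vector<int> Lo, Hi;
  buildUnpackMasks(4, 32, Lo, Hi);
  for (int M : Lo) printf("%d ", M);
  printf("| ");
  for (int M : Hi) printf("%d ", M);
  printf("\n");
  return 0;
}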
@@ -8218,7 +8897,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, static SDValue lowerV8I16GeneralSingleInputVectorShuffle( SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(VT.getScalarType() == MVT::i16 && "Bad input type!"); + assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); @@ -8286,16 +8965,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + bool ThreeAInputs = AToAInputs.size() == 3; + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; - int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; - int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; - int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; - ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; - int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int &TripleDWord = ThreeAInputs ? ADWord : BDWord; + int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; + int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; + ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; + int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); @@ -8364,8 +9045,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); - int APinnedIdx = - AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } @@ -8751,10 +9431,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); - if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, @@ -8798,10 +9477,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. 
if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -8812,8 +9490,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it @@ -9037,17 +9715,14 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 0, 16, 1, 17, 2, 18, 3, 19, - // High half. - 4, 20, 5, 21, 6, 22, 7, 23})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 8, 24, 9, 25, 10, 26, 11, 27, - // High half. - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9086,8 +9761,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; } @@ -9296,7 +9971,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; - MVT ScalarVT = VT.getScalarType(); + MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build @@ -9308,7 +9983,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; @@ -9478,7 +10153,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, ArrayRef<int> Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be @@ -9682,6 +10357,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. 
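As a rough standalone model of the classification the helper below performs (hypothetical names): once one 256-bit half of the result is known undef, each used mask element is attributed to one of the four half-vectors, and if at most two lower halves are referenced the shuffle can be rebuilt at half width.

#include <cstdio>
#include <vector>

// Given the used half of a 2*HalfN-wide shuffle mask, try to express it as a
// HalfN-wide shuffle of at most two source halves. Returns false if more than
// two halves (or, as in the patch, any upper half) would be needed.
static bool buildHalfShuffle(const std::vector<int> &UsedMask, int HalfN,
                             int &HalfIdx1, int &HalfIdx2,
                             std::vector<int> &HalfMask) {
  HalfIdx1 = HalfIdx2 = -1;
  for (int M : UsedMask) {
    if (M < 0) { HalfMask.push_back(M); continue; } // Undef stays undef.
    int HalfIdx = M / HalfN; // 0: lo V1, 1: hi V1, 2: lo V2, 3: hi V2.
    if (HalfIdx != 0 && HalfIdx != 2)
      return false; // Only the lower halves are handled here.
    int HalfElt = M % HalfN;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfIdx1 = HalfIdx;
      HalfMask.push_back(HalfElt);
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfIdx2 = HalfIdx;
      HalfMask.push_back(HalfElt + HalfN);
    } else {
      return false; // Would need a third half-vector.
    }
  }
  return true;
}

int main() {
  // Lower half of v8 mask <0, 9, 2, 11, u, u, u, u>: reads V1-lo and V2-lo,
  // so it becomes a v4 shuffle <0, 5, 2, 7> of those two halves.
  std::vector<int> Used = {0, 9, 2, 11}, HalfMask;
  int H1, H2;
  if (buildHalfShuffle(Used, 4, H1, H2, HalfMask)) {
    printf("halves %d,%d mask ", H1, H2);
    for (int M : HalfMask) printf("%d ", M); // halves 0,2 mask 0 5 2 7
    printf("\n");
  }
  return 0;
}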
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? 
V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -9776,16 +10553,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -9876,14 +10647,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -9941,14 +10707,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. 
We also need to squash the @@ -9974,9 +10735,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v8i32, VPermMask)), - V1); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10041,14 +10800,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; } // Try to use shift instructions. @@ -10115,18 +10869,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10215,22 +10960,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10296,12 +11028,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; - // There is a really nice hard cut-over between AVX1 and AVX2 that means we can - // check for those subtargets here and avoid much of the subtarget querying in - // the per-vector-type lowering routines. 
With AVX1 we have essentially *zero* - // ability to manipulate a 256-bit vector with integer types. Since we'll use - // floating point types there eventually, just immediately cast everything to - // a float and operate entirely in that domain. + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) @@ -10334,6 +11071,57 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } +/// \brief Try to lower a vector shuffle as a 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // To handle 256 bit vector requires VLX and most probably + // function lowerV2X128VectorShuffle() is better solution. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, @@ -10345,21 +11133,21 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. 
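// Illustrative sketch, not from the patch (helper name is the editor's): how
// the widened 4-element mask is packed into the vshuf64x2-style immediate
// built by lowerV4X128VectorShuffle above.  Each of the four 128-bit result
// lanes gets ControlBitsNum = 2 selector bits; zeroable lanes are not handled
// here (the real code gives up on them).
static unsigned buildShuf128Imm(const int WidenedMask[4]) {
  unsigned PermMask = 0;
  for (int i = 0; i != 4; ++i) {
    int Sel = (WidenedMask[i] < 0) ? 0 : WidenedMask[i]; // undef -> lane 0
    PermMask |= (unsigned)(Sel % 4) << (i * 2);
  }
  return PermMask; // e.g. {0, 1, 4, 5} (low halves of V1 then V2) -> 0x44
}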
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); @@ -10367,22 +11155,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. @@ -10396,21 +11173,21 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); @@ -10418,22 +11195,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10448,8 +11214,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. @@ -10517,6 +11282,60 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no a dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. 
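// Illustrative sketch, not from the patch (helper name is the editor's): a
// scalar model of the approach just described, for an 8-bit mask register.
// Each mask bit is sign-extended to a whole element, the elements are
// shuffled, and the result is truncated back to one bit per element.
// Idx[] uses -1 for undef lanes.
#include <cstdint>
static uint8_t shuffleMask8(uint8_t Mask, const int Idx[8]) {
  int8_t Wide[8], Res[8];
  for (int i = 0; i != 8; ++i)
    Wide[i] = ((Mask >> i) & 1) ? -1 : 0;     // sign-extend bit i to an element
  for (int i = 0; i != 8; ++i)
    Res[i] = (Idx[i] < 0) ? 0 : Wide[Idx[i]]; // the SIMD shuffle
  uint8_t Out = 0;
  for (int i = 0; i != 8; ++i)
    Out |= (uint8_t)(Res[i] & 1) << i;        // truncate back to i1
  return Out;
}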
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10533,8 +11352,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -10572,7 +11393,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; - if (VT.getScalarSizeInBits() < 64 && + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) @@ -10640,17 +11461,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, } // For each vector width, delegate to a specialized lowering routine. - if (VT.getSizeInBits() == 128) + if (VT.is128BitVector()) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - if (VT.getSizeInBits() == 256) + if (VT.is256BitVector()) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - // Force AVX-512 vectors to be scalarized for now. - // FIXME: Implement AVX-512 support! 
- if (VT.getSizeInBits() == 512) + if (VT.is512BitVector()) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -10661,11 +11482,16 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + unsigned NumElemsInLane = NumElems / NumLanes; - // Blend for v16i16 should be symetric for the both lanes. + // Blend for v16i16 should be symmetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = @@ -10673,20 +11499,25 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, int Lane1Cond = -1, Lane2Cond = -1; if (isa<ConstantSDNode>(EltCond)) - Lane1Cond = !isZero(EltCond); + Lane1Cond = !isNullConstant(EltCond); if (isa<ConstantSDNode>(SndLaneEltCond)) - Lane2Cond = !isZero(SndLaneEltCond); + Lane2Cond = !isNullConstant(SndLaneEltCond); + unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. - MaskValue |= !Lane1Cond << i; + LaneMask = !Lane1Cond << i; else if (Lane1Cond < 0) - MaskValue |= !Lane2Cond << i; + LaneMask = !Lane2Cond << i; else return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; } return true; } @@ -10711,7 +11542,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( - isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1); + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } @@ -10776,9 +11608,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT.getSizeInBits() == 16) { - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, @@ -10801,8 +11632,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || - (isa<ConstantSDNode>(Op.getOperand(1)) && - cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -10900,10 +11730,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); - //if (IdxVal >= NumElems/2) - // IdxVal -= NumElems/2; - IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; + // Find IdxVal modulo ElemsPerChunk. 
Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } @@ -10918,8 +11749,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), @@ -10951,8 +11781,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. @@ -11039,7 +11868,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); @@ -11078,8 +11909,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - const Function *F = DAG.getMachineFunction().getFunction(); - bool MinSize = F->hasFnAttribute(Attribute::MinSize); + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -11199,14 +12029,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. 
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } @@ -11218,37 +12059,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.getVectorElementType() == MVT::i1) { - if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal - return Op; - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(OpVT); - unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); - - if (IdxVal == OpVT.getVectorNumElements() / 2) { - // Zero upper bits of the Vec - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - if (IdxVal == 0) { - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - // Zero upper bits of the Vec2 - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); - // Zero lower bits of the Vec - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - // Merge them together - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - } + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + return SDValue(); } @@ -11363,7 +12176,8 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -11430,7 +12244,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -11587,7 +12402,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -11599,10 +12415,18 @@ SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. 
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -11830,10 +12654,10 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -11855,10 +12679,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); - MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); @@ -11884,16 +12707,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); - Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, false, 0); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); } return Result; @@ -11937,16 +12760,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // Load the 64-bit value into an XMM register. 
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; @@ -11996,10 +12822,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. - EVT DestVT = Op.getValueType(); + MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, @@ -12025,14 +12852,23 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + SDLoc DL(Op); SDValue V = Op->getOperand(0); - EVT VecIntVT = V.getValueType(); + MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; - EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. - if (VecFloatVT != Op->getValueType(0)) + if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); @@ -12070,7 +12906,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue Low, High; if (Subtarget.hasSSE41()) { - EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); @@ -12108,6 +12944,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? 
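// Illustrative sketch, not from the patch (helper name is the editor's): a
// scalar version of the biasing trick that lowerUINT_TO_FP_vXi32 vectorizes.
// Both intermediate floats are exact, so the single final addition rounds the
// true value of V exactly once -- which is why reassociating the two FADDs
// (the unsafe-fp-math / PR24512 note above) breaks the algorithm.
#include <cstdint>
#include <cstring>
static float uint32ToFloat(uint32_t V) {
  uint32_t LoBits = 0x4B000000u | (V & 0xFFFFu); // float bits of 2^23 + lo16
  uint32_t HiBits = 0x53000000u | (V >> 16);     // float bits of 2^39 + hi16*2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(float));
  std::memcpy(&Hi, &HiBits, sizeof(float));
  float FHi = Hi - (549755813888.0f + 8388608.0f); // subtract 2^39 + 2^23
  return Lo + FHi;                                 // == (float)V
}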
SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; @@ -12137,11 +12974,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); case MVT::v16i8: case MVT::v16i16: - if (Subtarget->hasAVX512()) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } - llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -12150,7 +12986,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (Op.getValueType().isVector()) + if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't @@ -12161,6 +12997,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. + return Op; + } + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) @@ -12193,10 +13037,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, 8, 8); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; @@ -12223,24 +13066,52 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), - FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, 4); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. +// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. 
+// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. std::pair<SDValue,SDValue> -X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } @@ -12258,42 +13129,87 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 either into FISTP64 followed by a load from a temporary - // stack slot, or into the FTOL runtime function. + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; - if (!IsSigned && isIntegerTypeFTOL(DstTy)) - Opc = X86ISD::WIN_FTOL; - else - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. 
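// Illustrative sketch, not from the patch (helper name is the editor's): the
// select / subtract / XOR fixup described above in scalar form, with a plain
// signed conversion standing in for the FIST store.
#include <cstdint>
static uint64_t doubleToUint64ViaSignedFist(double Value) {
  const double Thresh = 9223372036854775808.0;           // 2^63
  uint32_t Adjust = (Value < Thresh) ? 0u : 0x80000000u;
  double FistSrc  = (Value < Thresh) ? Value : Value - Thresh;
  uint64_t Result = (uint64_t)(int64_t)FistSrc;           // signed "FIST"
  return Result ^ ((uint64_t)Adjust << 32);               // re-add 2^63 if needed
}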
+ // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. + + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); @@ -12301,28 +13217,52 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, } MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); - if (Opc != X86ISD::WIN_FTOL) { + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); + + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. 
+ // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); - } else { - SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Chain, Value); - SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, - MVT::i32, ftol.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, - MVT::i32, eax.getValue(2)); - SDValue Ops[] = { eax, edx }; - SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) - : DAG.getMergeValues(Ops, DL); - return std::make_pair(pair, SDValue()); } } @@ -12333,7 +13273,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1) + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); // Optimize vectors in AVX mode: @@ -12426,6 +13366,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. 
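// Illustrative sketch, not from the patch (helper name is the editor's): per
// element, truncation to i1 keeps only bit 0; shifting it into the MSB lets
// the sign-bit based AVX-512 mask moves (VPMOVB2M and friends) and TESTM
// recover it.
#include <cstdint>
static bool truncElementToI1(uint32_t Elt) {
  return (Elt << 31) != 0;  // only the MSB survives the shift; same as Elt & 1
}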
+ unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -12443,42 +13439,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); - - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -12583,7 +13554,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) return Op; + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. 
@@ -12600,7 +13572,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; - assert(FIST.getNode() && "Unexpected failure"); + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12643,6 +13617,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -12650,11 +13626,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { MVT LogicVT; MVT EltVT; unsigned NumElts; - + if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -12675,9 +13656,10 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -12685,7 +13667,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -12704,6 +13686,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -12718,13 +13701,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector<Constant *, 4> CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). 
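// Illustrative sketch, not from the patch (helper name is the editor's): the
// scalar bit manipulation that the FAND / FAND / FOR sequence in
// LowerFCOPYSIGN performs on the widened vector values.
#include <cstdint>
#include <cstring>
static float copySignF32(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(float));
  std::memcpy(&S, &Sign, sizeof(float));
  uint32_t Bits = (M & 0x7FFFFFFFu)    // clear the sign bit of the magnitude
                | (S & 0x80000000u);   // keep only the sign bit of the sign
  float Out;
  std::memcpy(&Out, &Bits, sizeof(float));
  return Out;
}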
@@ -12737,11 +13723,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; - SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -12750,8 +13738,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12761,18 +13750,21 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa<ConstantFPSDNode>(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12859,7 +13851,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } - EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) @@ -12999,14 +13991,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. 
- if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13135,13 +14127,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - if (Op0.getValueType() == MVT::i1) - llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13150,8 +14140,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -13188,6 +14177,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -13261,13 +14253,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. -bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { - return NumUsers > 1; -} - -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isAllOnesValue(); +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node @@ -13285,8 +14272,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. 
unsigned BitWidth = Op0.getValueSizeInBits(); @@ -13423,7 +14409,7 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, @@ -13467,8 +14453,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && - Op.getValueType().getScalarType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); @@ -13515,7 +14501,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); - if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. @@ -13606,13 +14592,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - EVT OpVT = Op1.getValueType(); + MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { - if (Op1.getValueType().is512BitVector() || + if (Op1.getSimpleValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); @@ -13628,6 +14614,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. + unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -13777,7 +14790,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
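That sign flip is the standard way to get unsigned vector compares out of SSE's signed-only PCMPGT: XOR both operands with the sign bit and the unsigned ordering becomes the signed ordering. A scalar sketch of the equivalence (illustrative only; the function name is invented):

#include <cstdint>

// a <u b  is the same as  (a ^ 0x80000000) <s (b ^ 0x80000000).
// Example: 0xFFFFFFFF <u 1 is false; after the flip, INT_MAX <s INT_MIN+1 is also false.
bool unsignedLessViaSignedCompare(uint32_t a, uint32_t b) {
  int32_t sa = int32_t(a ^ 0x80000000u);  // the XOR with the sign-bit constant SB above
  int32_t sb = int32_t(b ^ 0x80000000u);
  return sa < sb;                         // now a signed compare (PCMPGT) suffices
}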
if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); @@ -13818,11 +14831,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; @@ -13831,17 +14842,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -13854,8 +14862,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -13876,6 +14883,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -13918,7 +14942,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); - EVT VT = Op1.getValueType(); + MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops @@ -13927,7 +14951,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && - VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); @@ -13961,12 +14985,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. - EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); - EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); @@ -13980,26 +15004,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - if (VT.isVector() && VT.getScalarType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + 
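The CMP/AND/ANDN/OR sequence used near the top of LowerSELECT for scalar FP selects is a branch-free blend: the compare produces an all-ones or all-zeros lane, and the two inputs are merged through it. A scalar bit-level sketch of that blend, purely illustrative and not the DAG sequence itself:

#include <cstdint>
#include <cstring>

// result = cond ? t : f, built from a compare mask the way CMPSS + AND/ANDN/OR does.
float blendSelect(bool cond, float t, float f) {
  uint32_t mask = cond ? 0xFFFFFFFFu : 0u;   // the CMPSS lane: all-ones or all-zeros
  uint32_t tb, fb;
  std::memcpy(&tb, &t, 4);
  std::memcpy(&fb, &f, 4);
  uint32_t r = (tb & mask) | (fb & ~mask);   // FAND, FANDN, FOR
  float out;
  std::memcpy(&out, &r, 4);
  return out;
}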
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); } } @@ -14026,22 +15050,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14061,11 +15084,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14073,11 +15095,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -14136,15 +15156,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate if the high bits are known zero. + // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. 
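The helper being matched here, LowerToBT, recognizes single-bit tests and turns them into BT plus a carry-flag read. The pattern it is looking for, in plain form (a sketch):

#include <cstdint>

// ((x >> n) & 1) != 0  and  (x & (1u << n)) != 0  both test one bit of x;
// BT x, n copies exactly that bit into CF, and SETC/JC reads it back.
bool testBit(uint32_t x, unsigned n) {
  return ((x >> n) & 1u) != 0;
}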
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14166,11 +15185,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -14256,8 +15276,8 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); - MVT InSVT = InVT.getScalarType(); - assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); @@ -14276,7 +15296,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. - while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); @@ -14286,7 +15306,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = - CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } @@ -14346,7 +15366,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); @@ -14470,7 +15490,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize / MemVT.getScalarType().getSizeInBits()); + loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -14518,29 +15538,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, return Sext; } - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. 
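Both the shuffle-plus-SRA scheme being deleted in the next hunk and the SIGN_EXTEND_VECTOR_INREG node that replaces it come down to the same per-lane step: sign-extend a value from a narrower bit width. One fully-defined way to write that step in C++ (a sketch, unrelated to the DAG helpers):

#include <cstdint>

// Sign-extend the low `fromBits` bits of v (1 <= fromBits <= 32).
int32_t signExtendInReg(uint32_t v, unsigned fromBits) {
  uint32_t keep = (fromBits == 32) ? ~0u : ((1u << fromBits) - 1u);
  uint32_t sign = 1u << (fromBits - 1);
  uint32_t low  = v & keep;
  // Classic (x ^ m) - m trick: subtracting the sign bit back out propagates it upward.
  return int32_t((low ^ sign) - sign);
}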
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && - "We can't implement a sext load without an arithmetic right shift!"); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; - - SDValue Shuff = DAG.getVectorShuffle( - WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); - - Shuff = DAG.getBitcast(RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = - DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, dl, RegVT)); + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } @@ -14577,11 +15580,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -14597,8 +15598,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isa<ConstantSDNode>(Cond.getOperand(1)) && - cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || @@ -14625,11 +15625,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
@@ -14673,16 +15671,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } @@ -14844,8 +15840,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14877,54 +15872,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. 
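The non-expanded path being restructured in this hunk boils down to plain pointer arithmetic: drop the stack pointer by the allocation size, then round down when the requested alignment exceeds the stack's. In isolation (a sketch; STACK_ALIGN stands in for TFI.getStackAlignment() and the function name is invented):

#include <cstdint>
#include <cstddef>

constexpr uintptr_t STACK_ALIGN = 16;  // assumed ABI stack alignment

// New SP after a dynamic alloca of `size` bytes with power-of-two `align`.
uintptr_t dynAllocaSP(uintptr_t sp, size_t size, uintptr_t align) {
  sp -= size;                  // the ISD::SUB
  if (align > STACK_ALIGN)
    sp &= ~(align - 1);        // the ISD::AND with -(uint64_t)Align
  return sp;                   // copied back into the stack pointer register
}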
- SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -14942,10 +15923,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); @@ -14967,9 +15946,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -14980,7 +15964,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -15019,10 +16004,11 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL)); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, - MachinePointerInfo(SV, 16), false, false, 0); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -15030,10 +16016,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && - "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. 
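The 8/16 versus 4/12 constants introduced in the va_start lowering above are just the field offsets of the SysV x86-64 va_list element under LP64 and X32 pointer sizes; Win64, handled by the expandVAArg/expandVACopy calls nearby, keeps a plain char* instead. A layout sketch with the offsets spelled out (assuming the usual alignment rules; the struct names are invented):

#include <cstdint>
#include <cstddef>

// SysV x86-64 va_list element: { i32, i32, i8*, i8* }.
struct VAListLP64 {
  uint32_t gp_offset;          // offset 0
  uint32_t fp_offset;          // offset 4
  uint64_t overflow_arg_area;  // offset 8, 8-byte pointer
  uint64_t reg_save_area;      // offset 16
};
struct VAListX32 {
  uint32_t gp_offset;          // offset 0
  uint32_t fp_offset;          // offset 4
  uint32_t overflow_arg_area;  // offset 8, 4-byte pointer
  uint32_t reg_save_area;      // offset 12
};
static_assert(offsetof(VAListLP64, reg_save_area) == 16, "LP64 layout");
static_assert(offsetof(VAListX32,  reg_save_area) == 12, "X32 layout");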
+ return DAG.expandVAArg(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); @@ -15061,8 +16050,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget->useSoftFloat() && - !(DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::NoImplicitFloat)) && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -15091,8 +16079,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); @@ -15230,72 +16224,126 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); - EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (and \p Op, \p Mask) for compare instructions or -/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. -static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), - MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { - assert(MaskVT.isSimple() && "invalid mask type"); + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } - if (isAllOnes(Mask)) - return Op; + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. 
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is comming as MVT::i8 and it should be truncated +/// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for -/// a scalar instruction. +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. 
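Both the vector helper above and the scalar variant defined next implement the same contract: bit i of the mask decides whether lane i takes the freshly computed result or keeps the pass-through value (merge masking), and zero masking is the same select with an all-zero pass-through. A small scalar model of that select, independent of the DAG nodes (names invented for illustration):

#include <cstdint>
#include <cstddef>

// dst[i] = mask bit i set ? result[i] : passthru[i]    (lanes <= 16 for a 16-bit mask)
// Zero masking is the same call with passthru pointing at zeros.
void applyMask(float *dst, const float *result, const float *passthru,
               uint16_t mask, size_t lanes) {
  for (size_t i = 0; i < lanes; ++i)
    dst[i] = ((mask >> i) & 1u) ? result[i] : passthru[i];
}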
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - EVT VT = Op.getValueType(); - SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -15309,15 +16357,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -15334,29 +16383,35 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
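The frame-recovery math in the comments above is small enough to restate as plain integer arithmetic (illustrative only; the helper names are invented): on 32-bit targets the parent frame pointer is found below the EH registration node, while on x64 the recorded offset is simply added to the entry RSP.

#include <cstdint>

// 32-bit: EntryEBP -> registration node -> parent FP.
uintptr_t recoverParentFP32(uintptr_t entryEBP, unsigned regNodeSize,
                            intptr_t parentFrameOffset) {
  uintptr_t regNodeBase = entryEBP - regNodeSize;  // RegNodeBase = EntryEBP - RegNodeSize
  return regNodeBase - parentFrameOffset;          // offset is negative on x86, so this moves up
}

// x64: ParentFP = EntryRSP (after the prologue) + ParentFrameOffset.
uintptr_t recoverParentFP64(uintptr_t entryRSP, intptr_t parentFrameOffset) {
  return entryRSP + parentFrameOffset;
}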
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize - // ParentFP = RegNodeBase - RegNodeFrameOffset + // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); - return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { @@ -15365,6 +16420,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -15376,28 +16434,53 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; + // We allways add rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. if (Op.getNumOperands() == 4) - RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); - } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); + SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction". 
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, Passthru, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); @@ -15405,7 +16488,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: - // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); @@ -15421,11 +16504,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK: { + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -15440,8 +16528,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1,Src2), + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { @@ -15449,7 +16537,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding modes. + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; @@ -15461,12 +16550,56 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
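For the TERLOG_OP_MASK / TERLOG_OP_MASKZ cases handled in the next hunk, the truncated imm8 acts as a 3-input truth table: at every bit position the three source bits form an index into the immediate (this is the VPTERNLOG definition, under which 0xF0, 0xCC and 0xAA select A, B and C respectively). A scalar evaluator as a sketch:

#include <cstdint>

// Per-bit ternary logic: result bit = imm8[(a<<2) | (b<<1) | c].
uint64_t ternlog(uint64_t a, uint64_t b, uint64_t c, uint8_t imm8) {
  uint64_t r = 0;
  for (unsigned i = 0; i < 64; ++i) {
    unsigned idx = unsigned((((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1));
    r |= uint64_t((imm8 >> idx) & 1) << i;
  }
  return r;   // e.g. imm8 = 0xE8 gives the bitwise majority of a, b, c
}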
@@ -15486,7 +16619,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: - case VPERM_3OP_MASK: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { @@ -15494,11 +16647,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element - if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; @@ -15523,6 +16676,50 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. + if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. 
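The CMP_MASK / CMP_MASK_CC lowering that continues below produces exactly this kind of value: each lane comparison contributes one bit, the bits form a k-register-style mask, and the mask is then inserted into a wider i1 vector and bitcast to the integer type the intrinsic returns. A scalar model (a sketch only; the function name is invented):

#include <cstdint>
#include <cstddef>

// Build an integer mask from an elementwise compare; unused high bits stay zero,
// matching the insert-into-wider-i1-vector plus bitcast done by the lowering.
uint8_t cmpEqMask(const int32_t *a, const int32_t *b, size_t lanes /* <= 8 */) {
  uint8_t m = 0;
  for (size_t i = 0; i < lanes; ++i)
    m |= uint8_t(a[i] == b[i]) << i;
  return m;
}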
@@ -15534,12 +16731,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) - EVT VT = Op.getOperand(1).getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); @@ -15573,6 +16769,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); @@ -15584,6 +16806,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -15598,27 +16838,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
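KUNPCK concatenates two k-masks; per the ISA definition of KUNPCKBW, the second source supplies the low half and the first the high half, and the operand swap in the node built just below adapts the intrinsic's argument order to that. In integer form (a sketch following that definition):

#include <cstdint>

// KUNPCKBW: dest[7:0] = src2[7:0], dest[15:8] = src1[7:0].
uint16_t kunpckbw(uint8_t src1, uint8_t src2) {
  return uint16_t((uint16_t(src1) << 8) | src2);
}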
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -15832,23 +17120,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. 
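The gather node assembled here, like the scatter and prefetch helpers that follow, addresses memory the way a SIB operand does (base + index * scale + displacement) and gates each lane on a mask bit. A scalar model of a masked 32-bit gather (illustrative only; scale must be 1, 2, 4 or 8, and the function name is invented):

#include <cstdint>
#include <cstddef>

// dst[i] = mask bit i set ? *(int32_t *)(base + index[i]*scale + disp) : src[i]
// (lanes <= 8 for an 8-bit mask; the real instructions also clear each mask
// bit as its lane completes.)
void maskedGather32(int32_t *dst, const int32_t *src, uint8_t mask,
                    uintptr_t base, const int64_t *index, unsigned scale,
                    intptr_t disp, size_t lanes) {
  for (size_t i = 0; i < lanes; ++i)
    dst[i] = ((mask >> i) & 1u)
                 ? *reinterpret_cast<const int32_t *>(base + index[i] * scale + disp)
                 : src[i];
}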
@@ -15860,7 +17142,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -15871,25 +17153,19 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. @@ -15907,12 +17183,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); @@ -16034,64 +17309,59 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); - // Reload the base pointer (ESI) with the adjusted incoming EBP. - SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); - // Reload the spilled EBP value, now that the stack and base pointers are - // set up. - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - X86FI->setHasSEHFramePtrSave(true); - int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); - X86FI->setSEHFramePtrSaveIndex(FI); - SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), - MachinePointerInfo(), false, false, false, - VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); - } + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); - return Chain; + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16100,16 +17370,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) - return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { - default: - llvm_unreachable("Unknown Intrinsic Type"); - break; + default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. @@ -16214,8 +17482,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = DataToCompress.getValueType(); - if (isAllOnes(Mask)) // return just a store + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -16227,15 +17495,21 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -16359,6 +17633,21 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? 
X86::RDX : X86::EDX; +} + SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); @@ -16497,9 +17786,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. - InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" @@ -16588,8 +17879,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, 2, 2); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, @@ -16623,12 +17914,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vector (native support require VLX) are expended +// to 512-bit vector. +// 2. i8/i16 vector implemented using dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform operation on it's Lo a Hi part and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - EVT OpVT = VT; + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to 512 bit vector. + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split vector, it's Lo and Hi parts will be handled in next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use native supported vector instruction vplzcntd. 
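A quick scalar check of the identity the code below relies on (illustrative, not a patch hunk): for a W-bit element, ctlz_W(x) = ctlz_32(zext32(x)) - (32 - W), and the relation also holds at x = 0, where both sides give W.

#include <cassert>
#include <cstdint>

// Portable 32-bit count-leading-zeros, standing in for one vplzcntd lane.
static unsigned ctlz32(uint32_t X) {
  unsigned N = 0;
  for (uint32_t Bit = 1u << 31; Bit && !(X & Bit); Bit >>= 1)
    ++N;
  return N;
}

// ctlz on a W-bit value via the 32-bit operation:
//   ctlz_W(x) = ctlz_32(zext32(x)) - (32 - W)
static unsigned ctlzNarrow(uint16_t X, unsigned W) {
  return ctlz32(X) - (32 - W);
}

int main() {
  assert(ctlzNarrow(0x0001, 16) == 15);
  assert(ctlzNarrow(0x8000, 16) == 0);
  assert(ctlzNarrow(0x0000, 16) == 16); // zero input is well-defined too
  assert(ctlzNarrow(0x10, 8) == 3);     // 8-bit lane: 0b00010000
  return 0;
}
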
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -16658,7 +18012,8 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -16686,13 +18041,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - unsigned NumBits = VT.getSizeInBits(); + unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); - Op = Op.getOperand(0); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { @@ -16753,6 +18134,13 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -16885,7 +18273,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AhiBlo = Ahi; SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? 
MVT::v8i32 : MVT::v16i32; A = DAG.getBitcast(MulVT, A); B = DAG.getBitcast(MulVT, B); @@ -16962,7 +18350,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); - EVT VT = Op0.getValueType(); + MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || @@ -17034,7 +18422,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } -// Return true if the requred (according to Opcode) shift-imm form is natively +// Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17054,14 +18442,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, } // The shift amount is a variable, but it is the same for all vector lanes. -// These instrcutions are defined together with shift-immediate. +// These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } -// Return true if the requred (according to Opcode) variable-shift form is +// Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17133,27 +18521,37 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // i64 SRA needs to be performed as partial shifts. if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA) + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - if (Op.getOpcode() == ISD::SHL) { - // Simple i8 add case - if (ShiftAmt == 1) - return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
@@ -17161,24 +18559,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } - - // R s>> a === ((R u>> a) ^ m) - m + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -17189,35 +18577,51 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. + int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } - // Check remaining shift amounts. - for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { - uint64_t ShAmt = 0; - for (unsigned j = 0; j != Ratio; ++j) { - ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (!C) + + // Check remaining shift amounts (if not a splat). 
+ if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) return SDValue(); - // 6 == Log2(64) - ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } - if (ShAmt != ShiftAmt) - return SDValue(); } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -17245,7 +18649,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { // Check if this build_vector node is doing a splat. @@ -17262,7 +18666,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { @@ -17327,11 +18731,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) - return V; + return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. + if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { @@ -17343,6 +18762,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. 
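The same sign-extension trick appears twice above: for the v16i8 SRA-by-immediate (Mask = 128 >> ShiftAmt) and for the v2i64/v4i64 arithmetic shifts (M = SIGN_BIT >> Amt). A standalone scalar check of the identity, illustrative only:

#include <cassert>
#include <cstdint>

// ashr(x, s) == (lshr(x, s) ^ m) - m, where m = (sign bit) >> s.
// The xor clears or keeps the bit at the old sign position, and the
// subtract turns that bias back into a proper sign extension.
static int64_t ashrViaLshr(int64_t X, unsigned S) {
  uint64_t M = (UINT64_C(1) << 63) >> S;
  uint64_t L = static_cast<uint64_t>(X) >> S;
  return static_cast<int64_t>((L ^ M) - M);
}

int main() {
  assert(ashrViaLshr(-16, 2) == -4);
  assert(ashrViaLshr(16, 2) == 4);
  assert(ashrViaLshr(INT64_MIN, 1) == INT64_MIN / 2);
  assert(ashrViaLshr(-1, 63) == -1);
  return 0;
}
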
@@ -17351,9 +18783,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector<SDValue, 8> Elts; - EVT SVT = VT.getScalarType(); + MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); - const APInt &One = APInt(SVTBits, 1); + APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { @@ -17364,7 +18796,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -17443,7 +18875,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. - EVT CastVT = MVT::v4i32; + MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); @@ -17507,7 +18939,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } - if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17627,7 +19060,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } - if (Subtarget->hasInt256() && VT == MVT::v16i16) { + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); @@ -17710,7 +19143,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); @@ -17743,6 +19176,40 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. 
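For reference on the XOP rotates handled above (a sketch, not a patch hunk; the +ve/-ve left/right convention is taken from the comment in the lowering itself): one 32-bit lane of the rotate-left direction, modeled in scalar code.

#include <cassert>
#include <cstdint>

// Scalar model of one 32-bit lane of a rotate-left by immediate.
// The masking here only keeps the helper well-defined; the lowering above
// asserts the immediate is already in range.
static uint32_t rotl32(uint32_t X, unsigned R) {
  R &= 31;
  return R == 0 ? X : (X << R) | (X >> (32 - R));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  return 0;
}
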
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering @@ -17759,8 +19226,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -17775,8 +19241,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -17827,7 +19292,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) @@ -17844,21 +19309,23 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. -bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); - return needsCmpXchgNb(PTy->getElementType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { - return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -17869,14 +19336,14 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. 
- return AtomicRMWExpansionKind::None; + return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return !AI->use_empty() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -17884,7 +19351,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. - return AtomicRMWExpansionKind::CmpXChg; + return AtomicExpansionKind::CmpXChg; } } @@ -17898,7 +19365,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. @@ -17926,7 +19393,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear - // otherwise, we might be able to be more agressive on relaxed idempotent + // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) @@ -18043,7 +19510,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SDValue InVec = Op->getOperand(0); SDLoc dl(Op); unsigned NumElts = SrcVT.getVectorNumElements(); - EVT SVT = SrcVT.getVectorElementType(); + MVT SVT = SrcVT.getVectorElementType(); // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. @@ -18103,7 +19570,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -18119,9 +19587,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. 
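An illustrative scalar model (not from the patch) of why a PSADBW against an all-zero vector serves as the horizontal reduction in the byte-sum/popcount path above: with zero as the second operand, the sum of absolute differences in each 64-bit lane is simply the sum of that lane's eight bytes.

#include <cassert>
#include <cstdint>

// One 64-bit lane of PSADBW versus zero: |b - 0| is just b, so the lane
// result is the sum of its eight bytes. This is how a byte-wise popcount
// is reduced to a per-i64 count.
static uint64_t psadbwLaneVsZero(uint64_t Lane) {
  uint64_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (Lane >> (8 * I)) & 0xFF;
  return Sum;
}

int main() {
  assert(psadbwLaneVsZero(0x0807060504030201ULL) == 36);   // 1+2+...+8
  assert(psadbwLaneVsZero(~0ULL) == 2040);                  // 8 * 255
  return 0;
}
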
@@ -18311,7 +19780,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(Op.getValueType().isVector() && + assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } @@ -18357,7 +19826,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getNode()->getSimpleValueType(0); + MVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -18435,31 +19904,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); - EVT VT = N->getValue().getValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + 
!VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } return Op; } @@ -18470,17 +20111,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); - EVT VT = Op.getValueType(); - assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); SDLoc dl(Op); - + MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -18572,6 +20255,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case 
ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -18592,12 +20276,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -18615,7 +20301,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -18634,14 +20326,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
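The X86ISD::AVG legalization above and the v2f32 FMIN/FMAX widening below both use the same pattern of concatenating the narrow vector with UNDEF up to a legal register width. For reference, one lane of the averaging node itself, a scalar sketch (not patch content) of the PAVGB rounded unsigned mean:

#include <cassert>
#include <cstdint>

// One PAVGB lane: unsigned average rounded up, computed in a wider type so
// the +1 cannot overflow.
static uint8_t pavgb_model(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((unsigned(A) + unsigned(B) + 1) >> 1);
}

int main() {
  assert(pavgb_model(0, 1) == 1);        // rounds up
  assert(pavgb_model(254, 255) == 255);
  assert(pavgb_model(10, 20) == 15);
  return 0;
}
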
case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); - if (VT != MVT::v2f32) - llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); + assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); @@ -18668,17 +20389,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: - // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert - // (FP_TO_SINT (load f16)) to FP_TO_INT*. - if (N->getOperand(0).getValueType() == MVT::f16) - break; - // fallthrough case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; - if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) - return; - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; @@ -18707,6 +20420,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; @@ -18740,6 +20454,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); @@ -18748,7 +20467,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; - EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; + MVT HalfT = Regs64bit ? 
MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); @@ -18884,6 +20603,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; @@ -18910,6 +20630,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::ABS: return "X86ISD::ABS"; + case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -18937,12 +20658,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -18978,6 +20701,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -19000,6 +20724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -19009,11 +20734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; + case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -19022,10 +20749,17 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SFENCE: return "X86ISD::SFENCE"; case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; - case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; + case X86ISD::VPROT: return "X86ISD::VPROT"; + case X86ISD::VPROTI: return "X86ISD::VPROTI"; + case X86ISD::VPSHA: return "X86ISD::VPSHA"; + case X86ISD::VPSHL: return "X86ISD::VPSHL"; + case X86ISD::VPCOM: return "X86ISD::VPCOM"; + case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -19038,7 +20772,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; - case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE"; + case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; + case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -19064,6 +20800,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; + case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; } return nullptr; } @@ -19218,7 +20956,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -19253,11 +20991,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return false; // Not for i1 vectors - if (VT.getScalarType() == MVT::i1) + if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. - if (VT.getSizeInBits() == 64) + if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can @@ -19282,8 +21020,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // @@ -19531,8 +21268,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); @@ -19702,8 +21438,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // stores were performed. 
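The isFMAFasterThanFMulAndFAdd hunk above folds the three feature tests into a single Subtarget->hasAnyFMA() query. The helper itself is not shown in this diff; presumably it is a one-liner in X86Subtarget.h along these lines (a sketch that simply mirrors the removed condition, not the verbatim definition):

// Sketch only -- the real helper lives in X86Subtarget.h and may differ.
bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }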
const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); @@ -19727,7 +21462,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); - if (!Subtarget->isTargetWin64()) { + if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); @@ -19744,9 +21479,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = - F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand *MMO = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) @@ -19800,6 +21534,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, return true; } +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -19811,8 +21578,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -19823,8 +21589,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); - // We also lower double CMOVs: + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). 
+ // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // // (CMOV (CMOV F, T, cc1), T, cc2) + // // to two successives branches. For that, we look for another CMOV as the // following instruction. // @@ -19890,19 +21689,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // .LBB5_4: // retq // - MachineInstr *NextCMOV = nullptr; + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); - if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. + while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) - NextCMOV = &*NextMIIt; + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } MachineBasicBlock *jcc1MBB = nullptr; - // If we have a double CMOV, we lower it to two successive branches to + // If we have a cascaded CMOV, we lower it to two successive branches to // the same block. EFLAGS is used by both, so mark it as live in the second. - if (NextCMOV) { + if (CascadedCMOV) { jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, jcc1MBB); jcc1MBB->addLiveIn(X86::EFLAGS); @@ -19917,7 +21739,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -19926,12 +21748,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. 
sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - if (NextCMOV) { - // The fallthrough block may be jcc1MBB, if we have a double CMOV. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. BB->addSuccessor(jcc1MBB); // In that case, jcc1MBB will itself fallthrough the copy0MBB, and @@ -19946,13 +21768,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); - if (NextCMOV) { + if (CascadedCMOV) { unsigned Opc2 = X86::GetCondBranchFromCond( - (X86::CondCode)NextCMOV->getOperand(3).getImm()); + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); } @@ -19964,28 +21785,110 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - MachineInstrBuilder MIB = - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. - // If we have a double CMOV, the second Jcc provides the same incoming + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). 
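The Case 1 comment above explains the operand renaming needed when several CMOVs with the same condition are turned into PHIs. The rule can be modeled outside of LLVM with an ordinary map; this standalone sketch (arbitrary integers standing in for virtual registers, not LLVM API) checks that the second PHI picks up t1, not t2, on the corresponding edge:

#include <cassert>
#include <map>
#include <utility>

// Standalone model of RegRewriteTable: for each CMOV destination, remember the two
// PHI inputs; later CMOV operands that name that destination are replaced by the
// matching per-edge input, mirroring the PHI-building loop above.
int main() {
  std::map<int, std::pair<int, int>> RegRewriteTable;
  const int t1 = 1, f1 = 2, t2 = 3, f2 = 4;
  // t2 = CMOV cc t1, f1   ==>   t2 = PHI t1(edge A), f1(edge B)
  RegRewriteTable[t2] = {t1, f1};
  // t3 = CMOV cc t2, f2   ==>   operands must be rewritten edge by edge.
  int Op1 = t2, Op2 = f2;
  if (RegRewriteTable.count(Op1)) Op1 = RegRewriteTable[Op1].first;
  if (RegRewriteTable.count(Op2)) Op2 = RegRewriteTable[Op2].second;
  assert(Op1 == t1 && Op2 == f2); // t3 = PHI t1(edge A), f2(edge B), never t2(...)
  return 0;
}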
- if (NextCMOV) { + if (CascadedCMOV) { MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), - DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) .addReg(MI->getOperand(0).getReg()); - NextCMOV->eraseFromParent(); + CascadedCMOV->eraseFromParent(); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Now remove the CMOV(s). + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + return sinkMBB; } MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineOperand MSrc = MI->getOperand(0); + unsigned VSrc = MI->getOperand(5).getReg(); + const MachineOperand &Disp = MI->getOperand(3); + MachineOperand ZeroDisp = MachineOperand::CreateImm(0); + bool hasDisp = Disp.isGlobal() || Disp.isImm(); + if (hasDisp && MSrc.isReg()) + MSrc.setIsKill(false); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -20032,8 +21935,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, sizeVReg = MI->getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; - MachineFunction::iterator MBBIter = BB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); @@ -20120,14 +22022,60 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { + assert(!Subtarget->isTargetMachO()); DebugLoc DL = MI->getDebugLoc(); + MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( + *BB->getParent(), *BB, MI, DL, false); + MachineBasicBlock *ResumeBB = ResumeMI->getParent(); + MI->eraseFromParent(); // The pseudo instruction is gone now. 
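EmitLoweredAtomicFP above matches the RELEASE_FADD32mr/RELEASE_FADD64mr pseudos produced for the pattern named in its comment. Roughly this kind of source (illustrative only; whether a given pipeline forms exactly this pseudo depends on earlier lowering) is what ends up as an ADDSS from memory followed by a MOVSS store:

#include <atomic>

// Source-level shape of "a.store(reg OP a.load(acquire), release)":
// an acquire load, a register FP add, and a release store to the same address.
void add_release(std::atomic<float> &a, float x) {
  float v = a.load(std::memory_order_acquire);
  a.store(v + x, std::memory_order_release);
}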
+ return ResumeBB; +} - assert(!Subtarget->isTargetMachO()); +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); + DebugLoc DL = MI->getDebugLoc(); - Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, - DL); + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + "SEH does not use catchret!"); - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Only 32-bit EH needs to worry about manually restoring stack pointers. + if (!Subtarget->is32Bit()) + return BB; + + // C++ EH creates a new target block to hold the restore code, and wires up + // the new block to the return destination with a normal JMP_4. + MachineBasicBlock *RestoreMBB = + MF->CreateMachineBasicBlock(BB->getBasicBlock()); + assert(BB->succ_size() == 1); + MF->insert(std::next(BB->getIterator()), RestoreMBB); + RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(RestoreMBB); + MI->getOperand(0).setMBB(RestoreMBB); + + auto RestoreMBBI = RestoreMBB->begin(); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const Constant *PerFn = MF->getFunction()->getPersonalityFn(); + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); + // Only 32-bit SEH requires special handling for catchpad. + if (IsSEH && Subtarget->is32Bit()) { + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); + } + MI->eraseFromParent(); return BB; } @@ -20149,6 +22097,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = + Subtarget->is64Bit() ? 
+ Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, @@ -20198,8 +22148,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); @@ -20225,7 +22174,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // For v = setjmp(buf), we generate // // thisMBB: - // buf[LabelOffset] = restoreMBB + // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: @@ -20245,6 +22194,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); + restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; @@ -20511,35 +22461,44 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); + case X86::CATCHRET: + return EmitLoweredCatchRet(MI, BB); + case X86::CATCHPAD: + return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); - case X86::CMOV_GR8: case X86::CMOV_FR32: case X86::CMOV_FR64: - case X86::CMOV_V4F32: + case X86::CMOV_FR128: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: - case X86::CMOV_V8F32: + case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: + case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: - case X86::CMOV_GR16: - case X86::CMOV_GR32: - case X86::CMOV_RFP32: - case X86::CMOV_RFP64: - case X86::CMOV_RFP80: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -20793,7 +22752,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getValueType().getScalarType().getSizeInBits(); + return Op.getValueType().getScalarSizeInBits(); // Fallback case. 
return 1; @@ -20814,39 +22773,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the -/// same as extracting the high 128-bit part of 256-bit vector and then -/// inserting the result into the low part of a new 256-bit vector -static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - -/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the -/// same as extracting the low 128-bit part of 256-bit vector and then -/// inserting the result into the high part of a new 256-bit vector -static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { @@ -20854,7 +22782,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -20920,24 +22848,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, InsV); } - //===--------------------------------------------------------------------===// - // Combine some shuffles into subvector extracts and inserts: - // - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (isShuffleHigh128VectorInsertLow(SVOp)) { - SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); - return DCI.CombineTo(N, InsV); - } - - // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - if (isShuffleLow128VectorInsertHigh(SVOp)) { - SDValue V = Extract128BitVector(V1, 0, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); - return DCI.CombineTo(N, InsV); - } - return SDValue(); } @@ -20966,10 +22876,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, MVT RootVT = Root.getSimpleValueType(); SDLoc DL(Root); - // Just remove no-op shuffle masks. if (Mask.size() == 1) { - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), - /*AddTo*/ true); + int Index = Mask[0]; + assert((Index >= 0 || Index == SM_SentinelUndef || + Index == SM_SentinelZero) && + "Invalid shuffle index found!"); + + // We may end up with an accumulated mask of size 1 as a result of + // widening of shuffle operands (see function canWidenShuffleElements). 
+ // If the only shuffle index is equal to SM_SentinelZero then propagate + // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle + // mask, and therefore the entire chain of shuffles can be folded away. + if (Index == SM_SentinelZero) + DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); + else + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + /*AddTo*/ true); return true; } @@ -20985,7 +22907,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // doesn't preclude something switching to the shorter encoding post-RA. // // FIXME: Should teach these routines about AVX vector widths. - if (FloatDomain && VT.getSizeInBits() == 128) { + if (FloatDomain && VT.is128BitVector()) { if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { bool Lo = Mask.equals({0, 0}); unsigned Shuffle; @@ -21049,7 +22971,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. - if (!FloatDomain && VT.getSizeInBits() == 128 && + if (!FloatDomain && VT.is128BitVector() && (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || @@ -21226,26 +23148,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // See if we can recurse into the operand to combine more things. switch (Op.getOpcode()) { - case X86ISD::PSHUFB: - HasPSHUFB = true; - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - if (Op.getOperand(0).hasOneUse() && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); - // We can't check for single use, we have to check that this shuffle is the only user. - if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && + "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the + // only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; } // Minor canonicalization of the accumulated shuffle mask to make it easier @@ -21360,8 +23284,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. 
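The canonicalization above maps duplication masks such as {0, 0} or {0, 0, 1, 1, 2, 2, 3, 3} onto UNPCKL/MOVLHPS-style instructions. A scalar model of unpack-low (plain arrays, no intrinsics; a sketch, not the lowering itself) shows why the duplication mask and the interleave are the same shuffle:

#include <array>
#include <cassert>
#include <cstddef>

// Scalar model: unpacklo interleaves the low halves of its two inputs.
template <typename T, std::size_t N>
std::array<T, N> unpacklo(const std::array<T, N> &A, const std::array<T, N> &B) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i < N / 2; ++i) {
    R[2 * i] = A[i];
    R[2 * i + 1] = B[i];
  }
  return R;
}

int main() {
  std::array<int, 4> A = {10, 11, 12, 13};
  // unpacklo(A, A) duplicates the low elements: exactly the {0, 0, 1, 1} mask above.
  assert((unpacklo(A, A) == std::array<int, 4>{10, 10, 11, 11}));
  return 0;
}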
- if (V.getSimpleValueType().getScalarType() != MVT::i8 && - V.getSimpleValueType().getScalarType() != MVT::i16) + if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && + V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. @@ -21438,7 +23362,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, return V; } -/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or +/// pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same @@ -21520,6 +23445,41 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; + case X86ISD::UNPCKL: { + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in + // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE + // moves upper half elements into the lower half part. For example: + // + // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, + // undef:v16i8 + // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 + // + // will be combined to: + // + // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 + + // This is only for 128-bit vectors. From SSE4.1 onward this combine may not + // happen due to advanced instructions. + if (!VT.is128BitVector()) + return SDValue(); + + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.getOpcode() == ISD::UNDEF && + Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 8> ExpectedMask(NumElts, -1); + std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, + NumElts / 2); + + auto ShufOp = Op1.getOperand(0); + if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); + } + return SDValue(); + } default: return SDValue(); } @@ -21535,7 +23495,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: - assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); + assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. @@ -21624,14 +23584,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { return SDValue(); auto *SVN = cast<ShuffleVectorSDNode>(N); - ArrayRef<int> Mask = SVN->getMask(); + SmallVector<int, 8> Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the SUB node, and the second to - // be the ADD node. - // FIXME: We should support the commuted patterns. - if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. 
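The UNPCKL-to-UNPCKH rewrite described in the comment above can be sanity-checked with a scalar model (plain arrays standing in for v16i8, with -1 marking undef lanes; a sketch, not the DAG combine itself):

#include <array>
#include <cassert>
#include <cstddef>

constexpr int kUndef = -1; // stand-in for undef lanes

// Scalar models of v16i8 UNPCKL/UNPCKH: interleave the low (resp. high) halves.
template <std::size_t N>
std::array<int, N> unpck(const std::array<int, N> &A, const std::array<int, N> &B,
                         bool High) {
  std::array<int, N> R{};
  std::size_t Base = High ? N / 2 : 0;
  for (std::size_t i = 0; i < N / 2; ++i) {
    R[2 * i] = A[Base + i];
    R[2 * i + 1] = B[Base + i];
  }
  return R;
}

int main() {
  std::array<int, 16> t1{};
  for (int i = 0; i < 16; ++i) t1[i] = 100 + i;

  // t2 = vector_shuffle<8..15,u,u,u,u,u,u,u,u> t1, undef: high half moved down.
  std::array<int, 16> t2{};
  t2.fill(kUndef);
  for (int i = 0; i < 8; ++i) t2[i] = t1[8 + i];

  std::array<int, 16> Undef{};
  Undef.fill(kUndef);

  auto Before = unpck(Undef, t2, /*High=*/false); // UNPCKL undef, t2
  auto After  = unpck(Undef, t1, /*High=*/true);  // UNPCKH undef, t1
  for (int i = 0; i < 16; ++i)
    if (Before[i] != kUndef && After[i] != kUndef)
      assert(Before[i] == After[i]); // all defined lanes agree
  return 0;
}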
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. @@ -21682,7 +23647,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -21866,21 +23831,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } -/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are -/// special and don't usually play with other vector types, it's better to -/// handle them early to be sure we emit efficient code by avoiding -/// store-load conversions. -static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::x86mmx || - N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR || - N->getOperand(0)->getValueType(0) != MVT::v2i32) - return SDValue(); +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); - SDValue V = N->getOperand(0); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1)); - if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)), - N->getValueType(0), V.getOperand(0)); + // Detect bitcasts between i32 to x86mmx low word. Since MMX types are + // special and don't usually play with other vector types, it's better to + // handle them early to be sure we emit efficient code by avoiding + // store-load conversions. + if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && + N0.getValueType() == MVT::v2i32 && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); + } + + // Convert a bitcasted integer logic operation that has one bitcasted + // floating-point operand and one constant operand into a floating-point + // logic operation. This may create a load of the constant, but that is + // cheaper than materializing the constant in an integer register and + // transferring it to an SSE register or transferring the SSE operand to + // integer register and back. 
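The bitcast combine introduced above targets scalar code of roughly the following shape (an illustrative example, not taken from the diff): masking the sign bit through an integer register, which can instead be emitted as an ANDPS/ANDPD against a constant-pool mask.

#include <cassert>
#include <cstdint>
#include <cstring>

// Sign-bit clearing through integer ops -- the kind of pattern that now maps to
// X86ISD::FAND with a constant operand instead of an SSE->GPR->SSE round trip.
static float fabs_via_int(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits)); // bitcast f32 -> i32
  bits &= 0x7fffffffu;                  // integer AND with a constant
  std::memcpy(&x, &bits, sizeof(bits)); // bitcast i32 -> f32
  return x;
}

int main() {
  assert(fabs_via_int(-2.5f) == 2.5f);
  assert(fabs_via_int(3.0f) == 3.0f);
  return 0;
}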
+ unsigned FPOpcode; + switch (N0.getOpcode()) { + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + default: return SDValue(); + } + if (((Subtarget->hasSSE1() && VT == MVT::f32) || + (Subtarget->hasSSE2() && VT == MVT::f64)) && + isa<ConstantSDNode>(N0.getOperand(1)) && + N0.getOperand(0).getOpcode() == ISD::BITCAST && + N0.getOperand(0).getOperand(0).getValueType() == VT) { + SDValue N000 = N0.getOperand(0).getOperand(0); + SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); + return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + } return SDValue(); } @@ -21910,26 +23899,26 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, InputVector.getNode()->getOperand(0)); // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). - SDValue MMXSrcOp = MMXSrc.getOperand(0); if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && - MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() && - MMXSrcOp.getOpcode() == ISD::BITCAST && - MMXSrcOp.getValueType() == MVT::v1i64 && - MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), - MMXSrcOp.getOperand(0)); + MMXSrc.getValueType() == MVT::i64) { + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), MMXSrcOp.getOperand(0)); + } } EVT VT = N->getValueType(0); - if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) && + if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && InputVector.getOpcode() == ISD::BITCAST && - dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) { + isa<ConstantSDNode>(InputVector.getOperand(0))) { uint64_t ExtractedElt = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); uint64_t InputValue = - cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } @@ -22036,96 +24025,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 
-static std::pair<unsigned, bool> -matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const X86Subtarget *Subtarget) { - if (!VT.isVector()) - return std::make_pair(0, false); - - bool NeedSplit = false; - switch (VT.getSimpleVT().SimpleTy) { - default: return std::make_pair(0, false); - case MVT::v4i64: - case MVT::v2i64: - if (!Subtarget->hasVLX()) - return std::make_pair(0, false); - break; - case MVT::v64i8: - case MVT::v32i16: - if (!Subtarget->hasBWI()) - return std::make_pair(0, false); - break; - case MVT::v16i32: - case MVT::v8i64: - if (!Subtarget->hasAVX512()) - return std::make_pair(0, false); - break; - case MVT::v32i8: - case MVT::v16i16: - case MVT::v8i32: - if (!Subtarget->hasAVX2()) - NeedSplit = true; - if (!Subtarget->hasAVX()) - return std::make_pair(0, false); - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - if (!Subtarget->hasSSE2()) - return std::make_pair(0, false); - } - - // SSE2 has only a small subset of the operations. - bool hasUnsigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v16i8); - bool hasSigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v8i16); - - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - unsigned Opc = 0; - // Check for x CC y ? x : y. - if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && - DAG.isEqualTo(RHS, Cond.getOperand(1))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? ISD::UMIN : 0; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? ISD::UMAX : 0; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? ISD::SMIN : 0; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? ISD::SMAX : 0; break; - } - // Check for x CC y ? y : x -- a min/max with reversed arms. - } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && - DAG.isEqualTo(RHS, Cond.getOperand(0))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? ISD::UMAX : 0; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? ISD::UMIN : 0; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? ISD::SMAX : 0; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? ISD::SMIN : 0; break; - } - } - - return std::make_pair(Opc, NeedSplit); -} - static SDValue transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -22189,7 +24088,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -22535,32 +24435,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to match a min/max vector operation. 
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { - std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); - unsigned Opc = ret.first; - bool NeedSplit = ret.second; - - if (Opc && NeedSplit) { - unsigned NumElems = VT.getVectorNumElements(); - // Extract the LHS vectors - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); - - // Extract the RHS vectors - SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); - SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); - - // Create min/max for each subvector - LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); - RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); - - // Merge the result - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); - } else if (Opc) - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - // Simplify vector selection if condition value type matches vselect // operand type if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { @@ -22635,7 +24509,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { - unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) @@ -22656,14 +24530,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. - if (VT.getScalarType() == MVT::i16) + if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. 
- if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41()) + if (VT.is128BitVector() && !Subtarget->hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 - if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 && - !Subtarget->hasAVX2()) + if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); @@ -22773,12 +24646,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; - ConstantSDNode *CS; - if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; - if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx == -1) break; @@ -22857,8 +24727,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { - ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); - if (!CondOp1C || !CondOp1C->isNullValue()) + if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); @@ -23102,106 +24971,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); - // SSE/AVX/AVX2 blend intrinsics. - case Intrinsic::x86_avx2_pblendvb: - // Don't try to simplify this intrinsic if we don't have AVX2. - if (!Subtarget->hasAVX2()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx_blendv_ps_256: - // Don't try to simplify this intrinsic if we don't have AVX. - if (!Subtarget->hasAVX()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_sse41_pblendvb: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - SDValue Mask = N->getOperand(3); - - // Don't try to simplify this intrinsic if we don't have SSE4.1. - if (!Subtarget->hasSSE41()) - return SDValue(); - - // fold (blend A, A, Mask) -> A - if (Op0 == Op1) - return Op0; - // fold (blend A, B, allZeros) -> A - if (ISD::isBuildVectorAllZeros(Mask.getNode())) - return Op0; - // fold (blend A, B, allOnes) -> B - if (ISD::isBuildVectorAllOnes(Mask.getNode())) - return Op1; - - // Simplify the case where the mask is a constant i32 value. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { - if (C->isNullValue()) - return Op0; - if (C->isAllOnesValue()) - return Op1; - } - - return SDValue(); - } - - // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 
- case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - EVT VT = Op0.getValueType(); - assert(VT.isVector() && "Expected a vector type!"); - - if (isa<BuildVectorSDNode>(Op1)) - Op1 = Op1.getOperand(0); - - if (!isa<ConstantSDNode>(Op1)) - return SDValue(); - - EVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - - ConstantSDNode *CND = cast<ConstantSDNode>(Op1); - const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - - // Don't try to convert this shift into a ISD::SRA if the shift - // count is bigger than or equal to the element size. - if (ShAmt >= SVTBits) - return SDValue(); - - // Trivial case: if the shift count is zero, then fold this - // into the first operand. - if (ShAmt == 0) - return Op0; - - // Replace this packed shift intrinsic with a target independent - // shift dag node. - SDLoc DL(N); - SDValue Splat = DAG.getConstant(C, DL, VT); - return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); - } - } -} - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -23228,9 +25006,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -23239,7 +25019,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -23253,10 +25032,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. 
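The multiply decomposition above leans on a handful of arithmetic identities: LEA can form x*3, x*5 and x*9, shifts handle powers of two, and the new cases cover multipliers of the form 2^N plus or minus one. The identities are easy to check standalone:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint64_t x : {0ull, 1ull, 7ull, 123456789ull}) {
    assert(x * 9 == (x << 3) + x);           // one LEA (base + index*8)
    assert(x * 45 == ((x << 3) + x) * 5);    // 45 = 9 * 5: two LEAs
    assert(x * 40 == (((x << 2) + x) << 3)); // 40 = 5 * 8: LEA + SHL
    assert(x * 17 == (x << 4) + x);          // 2^N + 1: SHL + ADD
    assert(x * 31 == (x << 5) - x);          // 2^N - 1: SHL + SUB
  }
  return 0;
}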
DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -23272,18 +25072,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY || - ((N00.getOpcode() == ISD::ANY_EXTEND || - N00.getOpcode() == ISD::ZERO_EXTEND) && - N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); - if (Mask != 0) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, - N00, DAG.getConstant(Mask, DL, VT)); - } + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } @@ -23304,6 +25120,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -23321,14 +25190,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -23342,6 +25212,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (SDValue V = PerformSHLCombine(N, DAG)) return V; + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) @@ -23537,7 +25411,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. 
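The (ashr (shl x, C1), C2) fold above is the DAG form of a familiar scalar identity. A standalone check for the 32-bit, low-byte case (assuming the usual two's-complement representation and arithmetic right shift of signed values, as noted in the comments):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Shift the low byte to the top, then arithmetically shift back: a sign extension
// of that byte plus any leftover shift. The shl goes through uint32_t to avoid
// signed-overflow UB; converting back to int32_t assumes two's complement.
static int32_t ashr_shl(int32_t x, unsigned ShlAmt, unsigned SarAmt) {
  int32_t Shifted = static_cast<int32_t>(static_cast<uint32_t>(x) << ShlAmt);
  return Shifted >> SarAmt; // arithmetic shift on all mainstream targets
}

int main() {
  for (int32_t x : {0, 1, -1, 0x7f, 0x80, 0xab, 123456}) {
    // SarConst == ShlConst: plain sign extension of the low 8 bits.
    assert(ashr_shl(x, 24, 24) == static_cast<int8_t>(x));
    // SarConst > ShlConst: sign extend, then shift right by the difference.
    assert(ashr_shl(x, 24, 27) == (static_cast<int8_t>(x) >> 3));
  }
  return 0;
}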
N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); @@ -23552,9 +25426,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + unsigned InBits = NarrowVT.getScalarSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + Mask = Mask.zext(VT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, DL, VT)); } @@ -23656,6 +25530,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(N0.getValueType(), NewShuffle); } +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -23668,6 +25577,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -23728,6 +25640,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); @@ -23799,7 +25714,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasSSE41()) return SDValue(); - EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -23813,9 +25728,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -23913,17 +25826,188 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + if (Subtarget->hasCMov()) if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. 
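detectAVGPattern, defined next, matches the rounding-average idiom described above. The identity it depends on is easy to verify in scalar form: widening to a larger integer type makes a + b + 1 overflow-free, and the halved result always fits back into the original element type, which is exactly what PAVGB/PAVGW compute. A small self-contained check (the function name here is illustrative only):

#include <cassert>
#include <cstdint>

// c = (a + b + 1) / 2, evaluated in a wider type as in the matched IR.
uint8_t avgRoundUp(uint8_t a, uint8_t b) {
  uint32_t wide = (uint32_t(a) + uint32_t(b) + 1) >> 1;  // at most 255
  return (uint8_t)wide;                                  // lossless truncation
}

int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      assert(avgRoundUp((uint8_t)a, (uint8_t)b) == ((a + b + 1) >> 1));
}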
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget *Subtarget, SDLoc DL) { + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && + isPowerOf2_32(NumElems))) + return SDValue(); + + // InScalarVT is the intermediate type in AVG pattern and it should be greater + // than the original input type (i8/i16). + EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // %1 = zext <N x i8> %a to <N x i32> + // %2 = zext <N x i8> %b to <N x i32> + // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> + // %4 = add nuw nsw <N x i32> %3, %2 + // %5 = lshr <N x i32> %N, <i32 1 x N> + // %6 = trunc <N x i32> %5 to <N x i8> + // + // In AVX512, the last instruction can also be a trunc store. + + if (In.getOpcode() != ISD::SRL) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector and each element + // is in the range [Min, Max]. + auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C) + return false; + uint64_t Val = C->getZExtValue(); + if (Val < Min || Val > Max) + return false; + } + return true; + }; + + // Check if each element of the vector is left-shifted by one. + auto LHS = In.getOperand(0); + auto RHS = In.getOperand(1); + if (!IsConstVectorInRange(RHS, 1, 1)) + return SDValue(); + if (LHS.getOpcode() != ISD::ADD) + return SDValue(); + + // Detect a pattern of a + b + 1 where the order doesn't matter. + SDValue Operands[3]; + Operands[0] = LHS.getOperand(0); + Operands[1] = LHS.getOperand(1); + + // Take care of the case when one of the operands is a constant vector whose + // element is in the range [1, 256]. + if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && + Operands[0].getOpcode() == ISD::ZERO_EXTEND && + Operands[0].getOperand(0).getValueType() == VT) { + // The pattern is detected. Subtract one from the constant vector, then + // demote it and emit X86ISD::AVG instruction. + SDValue One = DAG.getConstant(1, DL, InScalarVT); + SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(NumElems, One)); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1]); + } + + if (Operands[0].getOpcode() == ISD::ADD) + std::swap(Operands[0], Operands[1]); + else if (Operands[1].getOpcode() != ISD::ADD) + return SDValue(); + Operands[2] = Operands[1].getOperand(0); + Operands[1] = Operands[1].getOperand(1); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones, and the other two are promoted from i8/i16. 
+ for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + return SDValue(); } @@ -23940,10 +26024,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24012,8 +26099,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } @@ -24026,8 +26113,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -24055,7 +26142,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::NON_EXTLOAD); SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); - } /// PerformMSTORECombine - Resolve truncating stores static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24073,6 +26159,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. 
+ if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24096,12 +26191,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -24133,8 +26228,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24148,10 +26244,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - StVT == VT && !IsAligned) { + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24178,12 +26276,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. 
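The 256-bit store change above (and the matching load change earlier) only affects how the bytes reach memory: when TLI.allowsMemoryAccess reports the unaligned 32-byte access as not fast, the value is written as two 16-byte halves at offsets 0 and 16. A byte-level sketch of that equivalence (illustrative only, not the SelectionDAG code):

#include <cassert>
#include <cstdint>
#include <cstring>

// One 32-byte store ...
void storeWide(uint8_t *dst, const uint8_t (&src)[32]) {
  std::memcpy(dst, src, 32);
}

// ... and the split form: low 16 bytes, then the high 16 bytes at offset 16.
// Both write exactly the same bytes to the same addresses.
void storeSplit(uint8_t *dst, const uint8_t (&src)[32]) {
  std::memcpy(dst, src, 16);
  std::memcpy(dst + 16, src + 16, 16);
}

int main() {
  uint8_t src[32], a[32], b[32];
  for (int i = 0; i < 32; ++i) src[i] = (uint8_t)i;
  storeWide(a, src);
  storeSplit(b, src);
  assert(std::memcmp(a, b, 32) == 0);
}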
@@ -24306,7 +26421,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { - EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), @@ -24539,8 +26654,234 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. + if (OutVT == MVT::v8i8) { + Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); + Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], + DAG.getIntPtrConstant(0, DL)); + return Regs[0]; + } else if (RegNum > 1) { + Regs.resize(RegNum); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. +static SDValue +combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); + EVT OutVT = N->getValueType(0); + SDLoc DL(N); + + // Shift left by 16 bits, then arithmetic-shift right by 16 bits. 
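Why this shift pair makes PACKSS usable as a plain truncation: after shl 16 + sra 16, every 32-bit lane holds a value in [-32768, 32767], so the signed-saturating pack never clamps and simply forwards the low 16 bits. A per-lane scalar sketch (the helper name is mine; it assumes arithmetic right shift of signed values, as on x86):

#include <cassert>
#include <cstdint>

int16_t packssLane(uint32_t lane) {
  int32_t filled = (int32_t)(lane << 16) >> 16;        // sign-fill from bit 15
  assert(filled >= INT16_MIN && filled <= INT16_MAX);  // saturation is a no-op
  return (int16_t)filled;                              // == low 16 bits of lane
}

int main() {
  const uint32_t vals[] = {0u, 1u, 0x7FFFu, 0x8000u, 0xFFFFu, 0xDEADBEEFu};
  for (uint32_t v : vals)
    assert((uint16_t)packssLane(v) == (uint16_t)(v & 0xFFFF));
}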
+ SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); + for (auto &Reg : Regs) { + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + } + + for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) + Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], + Regs[i * 2 + 1]); + + if (Regs.size() > 2) { + Regs.resize(Regs.size() / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into +/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type +/// legalization the truncation will be translated into a BUILD_VECTOR with each +/// element that is extracted from a vector and then truncated, and it is +/// difficult to do this optimization based on them. +static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + SDValue In = N->getOperand(0); + if (!In.getValueType().isSimple()) + return SDValue(); + + EVT InVT = In.getValueType(); + unsigned NumElems = OutVT.getVectorNumElements(); + + // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on + // SSE2, and we need to take care of it specially. + // AVX512 provides vpmovdb. + if (!Subtarget->hasSSE2() || Subtarget->hasAVX2()) + return SDValue(); + + EVT OutSVT = OutVT.getVectorElementType(); + EVT InSVT = InVT.getVectorElementType(); + if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && + (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && + NumElems >= 8)) + return SDValue(); + + // SSSE3's pshufb results in fewer instructions in the cases below. + if (Subtarget->hasSSSE3() && NumElems == 8 && + ((OutSVT == MVT::i8 && InSVT != MVT::i64) || + (InSVT == MVT::i32 && OutSVT == MVT::i16))) + return SDValue(); + + SDLoc DL(N); + + // Split a long vector into vectors of legal type. + unsigned RegNum = InVT.getSizeInBits() / 128; + SmallVector<SDValue, 8> SubVec(RegNum); + if (InSVT == MVT::i32) { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(i * 4, DL)); + } else { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(i * 2, DL)); + } + + // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS + // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to + // truncate 2 x v4i32 to v8i16. + if (Subtarget->hasSSE41() || OutSVT == MVT::i8) + return combineVectorTruncationWithPACKUS(N, DAG, SubVec); + else if (InSVT == MVT::i32) + return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + else + return SDValue(); +} + +static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // Try to detect AVG pattern first. + SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, + Subtarget, SDLoc(N)); + if (Avg.getNode()) + return Avg; + + return combineVectorTruncation(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on floating point negations.
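The FNEG combine that follows is pure sign algebra over the FMA family: -(a*b + c) = -(a*b) - c, -(a*b - c) = -(a*b) + c, and so on, which is why FNEG(FMADD) can be rewritten as FNMSUB and the other three cases rotate similarly. A scalar sanity check of the first identity with unfused arithmetic (illustrative only; the DAG nodes are fused, and the FMUL case additionally requires the no-signed-zeros flag):

#include <cassert>

double fmaddRef (double a, double b, double c) { return   a * b + c;  }
double fnmsubRef(double a, double b, double c) { return -(a * b) - c; }

int main() {
  double a = 1.5, b = -2.25, c = 0.125;   // exactly representable values
  assert(-fmaddRef(a, b, c) == fnmsubRef(a, b, c));
}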
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + SDValue Arg = N->getOperand(0); + SDLoc DL(N); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // If we're negating a FMUL node on a target with FMA, then we can avoid the + // use of a constant by performing (-0 - A*B) instead. + // FIXME: Check rounding control flags as well once it becomes available. + if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && + Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + SDValue Zero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + } + + // If we're negating a FMA node, then we can adjust the + // instruction to include the extra negation. + if (Arg.hasOneUse()) { + switch (Arg.getOpcode()) { + case X86ISD::FMADD: + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMSUB: + return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMADD: + return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMSUB: + return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + } + } + return SDValue(); +} + +static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.is512BitVector() && !Subtarget->hasDQI()) { + // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. + // These logic operations may be executed in the integer domain. + SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. -static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x @@ -24552,7 +26893,8 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); - return SDValue(); + + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. 
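Both convertIntLogicToFPLogic earlier in this patch and lowerX86FPLogicOp above exploit the same fact: AND/OR/XOR operate purely on bit patterns, so bitcasting between the integer and FP views changes only the execution domain, never the result. A minimal illustration using the classic clear-the-sign-bit trick (memcpy stands in for the bitcast; the helper names are mine):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float f)     { uint32_t u; std::memcpy(&u, &f, 4); return u; }
static float    floatOf(uint32_t u) { float f;    std::memcpy(&f, &u, 4); return f; }

int main() {
  // fabs as a logic op: whether the AND runs as PAND (integer domain) or
  // ANDPS (FP domain), masking off the sign bit yields the same float.
  float x = -3.5f;
  float viaLogic = floatOf(bitsOf(x) & 0x7FFFFFFFu);
  assert(viaLogic == std::fabs(x));
}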
@@ -24576,8 +26918,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. + if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. 
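The FMAXNUM lowering above can be replayed in scalar form against its own NaN table: the SSE max passes the second source operand through whenever either input is NaN, and the trailing select repairs the one case where that operand is the NaN. A sketch under those assumptions (helper names are mine; the real code emits X86ISD::FMAX plus a SELECT/VSELECT on an unordered compare):

#include <cassert>
#include <cmath>

// MAXSS(x, y): x if x > y, otherwise y -- so any NaN input falls through to y.
static float sseMax(float x, float y) { return x > y ? x : y; }

static float fmaxnumSketch(float op0, float op1) {
  float maxOrOp0 = sseMax(op1, op0);        // NaN in either input yields op0
  return std::isnan(op0) ? op1 : maxOrOp0;  // ...unless op0 itself is the NaN
}

int main() {
  float nan = std::nanf("");
  assert(fmaxnumSketch(1.0f, 2.0f) == 2.0f);
  assert(fmaxnumSketch(1.0f, nan)  == 1.0f);   // NaN in op1 -> op0
  assert(fmaxnumSketch(nan, 2.0f)  == 2.0f);   // NaN in op0 -> op1
  assert(std::isnan(fmaxnumSketch(nan, nan))); // both NaN -> NaN
}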
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24588,11 +26987,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24603,7 +27003,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -24673,6 +27073,57 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. + bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. 
+ SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -24763,13 +27214,13 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (!Subtarget->hasFp256()) - return SDValue(); - - if (VT.isVector() && VT.getSizeInBits() == 256) + if (Subtarget->hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + return SDValue(); } @@ -24783,9 +27234,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -24830,8 +27279,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) + if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, @@ -24884,21 +27332,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) - if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) - if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { @@ -24936,52 +27382,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), dl, - 
Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc dl(N); - MVT VT = N->getOperand(1)->getSimpleValueType(0); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "X86insertps is only defined for v4x32"); - - SDValue Ld = N->getOperand(1); - if (MayFoldLoad(Ld)) { - // Extract the countS bits from the immediate so we can get the proper - // address when narrowing the vector load to a specific element. - // When the second source op is a memory address, insertps doesn't use - // countS and just gets an f32 from that address. - unsigned DestIndex = - cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; - - Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); - - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); - // countS bits are ignored when loading from memory on insertps, which - // means we don't need to explicitly set them to 0. - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), - LoadScalarToVector, N->getOperand(2)); - } - return SDValue(); -} - static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { SDValue V0 = N->getOperand(0); SDValue V1 = N->getOperand(1); @@ -25008,6 +27408,20 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + // Gather and Scatter instructions use k-registers for masks. The type of + // the masks is v*i1. So the mask will be truncated anyway. + // The SIGN_EXTEND_INREG my be dropped. + SDValue Mask = N->getOperand(2); + if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + NewOps[2] = Mask.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + } + return SDValue(); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. @@ -25182,7 +27596,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. - if (Op0.getOpcode() == ISD::LOAD) { + if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); EVT LdVT = Ld->getValueType(0); @@ -25357,15 +27771,14 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, } // Check if we can bypass extracting and re-inserting an element of an input - // vector. Essentialy: + // vector. 
Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); - if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1))) - if (ExtractIdx->getZExtValue() == 0) { + if (isNullConstant(ExtractedV.getOperand(1))) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { @@ -25394,7 +27807,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return PerformBITCASTCombine(N, DAG); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -25414,12 +27827,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); case X86ISD::FXOR: - case X86ISD::FOR: return PerformFORCombine(N, DAG); + case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case X86ISD::FAND: return PerformFANDCombine(N, DAG); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -25447,14 +27865,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); - case X86ISD::INSERTPS: { - if (getTargetMachine().getOptLevel() > CodeGenOpt::None) - return PerformINSERTPSCombine(N, DAG, Subtarget); - break; - } case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); + case ISD::MGATHER: + case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } return SDValue(); @@ -26084,6 +28497,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -26168,17 +28582,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); - MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 - : Size == 16 ? 
MVT::i16 - : Size == 32 ? MVT::i32 - : Size == 64 ? MVT::i64 - : MVT::Other; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; - Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass - : SimpleTy == MVT::i16 ? &X86::GR16RegClass - : SimpleTy == MVT::i32 ? &X86::GR32RegClass + Res.second = Size == 8 ? &X86::GR8RegClass + : Size == 16 ? &X86::GR16RegClass + : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { @@ -26196,6 +28606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) @@ -26244,6 +28655,15 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on x86 is expensive. However, when aggressively optimizing + // for code size, we prefer to use a div instruction, as it is usually smaller + // than the alternative sequence. + // The exception to this is vector division. Since x86 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 723d5304495c..a29dc9af54f6 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -126,6 +126,9 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, @@ -182,6 +185,8 @@ namespace llvm { /// Compute Sum of Absolute Differences. PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, /// Bitwise Logical AND NOT of Packed FP values. ANDNP, @@ -211,6 +216,8 @@ namespace llvm { // FP vector get exponent FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, // FP Scale SCALEF, // Integer add/sub with unsigned saturation. @@ -236,6 +243,9 @@ namespace llvm { // Integer absolute value ABS, + // Detect Conflicts Within a Vector + CONFLICT, + /// Floating point max and min. FMAX, FMIN, @@ -282,9 +292,8 @@ namespace llvm { // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. VFPEXT, @@ -295,6 +304,9 @@ namespace llvm { // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, + // Convert a vector to mask, set bits base on MSB. 
+ CVT2MASK, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -349,6 +361,7 @@ namespace llvm { // OR/AND test for masks KORTEST, + KTEST, // Several flavors of instructions with vector shuffle behaviors. PACKSS, @@ -382,12 +395,24 @@ namespace llvm { VPERMIV3, VPERMI, VPERM2X128, - //Fix Up Special Packed Float32/64 values + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values VFIXUPIMM, - //Range Restriction Calculation For Packed Pairs of Float32/64 values + // Range Restriction Calculation For Packed Pairs of Float32/64 values VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. + VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, // Broadcast scalar to vector VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, // Broadcast subvector to vector SUBV_BROADCAST, // Insert/Extract vector element @@ -397,13 +422,21 @@ namespace llvm { /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, @@ -418,7 +451,6 @@ namespace llvm { FNMSUB_RND, FMADDSUB_RND, FMSUBADD_RND, - RNDSCALE, // Compress and expand COMPRESS, @@ -443,9 +475,6 @@ namespace llvm { // falls back to heap allocation if not. SEG_ALLOCA, - // Windows's _ftol2 runtime routine to do fptoui. - WIN_FTOL, - // Memory barrier MEMBARRIER, MFENCE, @@ -580,15 +609,6 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); - /// AVX512 static rounding constants. These need to match the values in - /// avx512fintrin.h. - enum STATIC_ROUNDING { - TO_NEAREST_INT = 0, - TO_NEG_INF = 1, - TO_POS_INF = 2, - TO_ZERO = 3, - CUR_DIRECTION = 4 - }; } //===--------------------------------------------------------------------===// @@ -850,16 +870,7 @@ namespace llvm { /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 - } - - /// Return true if the target uses the MSVC _ftol2 routine for fptoui. - bool isTargetFTOL() const; - - /// Return true if the MSVC _ftol2 routine should be used for fptoui to the - /// given type. - bool isIntegerTypeFTOL(EVT VT) const { - return isTargetFTOL() && VT == MVT::i64; + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } /// \brief Returns true if it is beneficial to convert a load of a constant @@ -879,6 +890,16 @@ namespace llvm { unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -890,6 +911,11 @@ namespace llvm { bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const override; + /// Return true if the target stores SafeStack pointer at a fixed offset in + /// some non-standard address space, and populates the address space and + /// offset as appropriate. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; @@ -899,6 +925,8 @@ namespace llvm { /// \brief Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -908,7 +936,6 @@ namespace llvm { /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - const DataLayout *TD; /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. @@ -955,7 +982,6 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; - bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const; @@ -969,7 +995,6 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; @@ -994,9 +1019,9 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; @@ -1042,27 +1067,16 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; - bool needsCmpXchgNb(const Type *MemType) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, max, 
min, umax, umin). It takes the corresponding instruction to - /// expand, the associated machine basic block, and the associated X86 - /// opcodes for reg/reg. - MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, add, sub, swap) for 64-bit operands on 32-bit target. - MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const; + bool needsCmpXchgNb(Type *MemType) const; // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock *EmitVAARG64WithCustomInserter( @@ -1077,18 +1091,24 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; - MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const; @@ -1121,7 +1141,7 @@ namespace llvm { unsigned &RefinementSteps) const override; /// Reassociate floating point divisions into multiply by reciprocal. - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; }; namespace X86 { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index faa91500b181..8bf2925a75db 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -79,7 +79,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (TypeVariantName, "i"), !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), + !if (!eq (Size, 512), !if (!eq (EltSize, 64), "v8i64", "v16i32"), VTName))), VTName)); @@ -145,6 +145,8 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; // We map scalar types to the smallest (128-bit) vector type // with the appropriate element type. This allows to use the same masking logic. +def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; +def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; @@ -274,6 +276,22 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; +// Similar to AVX512_maskable_3rc but in this case the input VT for the tied +// operand differs from the output VT. This requires a bitconvert on +// the preserved vector going into the vselect. 
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, @@ -471,84 +489,123 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // - -multiclass vinsert_for_size_no_alt<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> { +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), - (From.VT From.RC:$src2), - (iPTR imm)))]>, - EVEX_4V, EVEX_V512; + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - let mayLoad = 1 in - def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - []>, - EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; - } -} - -multiclass vinsert_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> : - vinsert_for_size_no_alt<Opcode, From, To, - vinsert_insert, INSERT_get_vinsert_imm> { - // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for - // vinserti32x4. Only add this if 64x2 and friends are not supported - // natively via AVX512DQ. 
- let Predicates = [NoDQI] in + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { def : Pat<(vinsert_insert:$ins - (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") - VR512:$src1, From.RC:$src2, - (INSERT_get_vinsert_imm VR512:$ins)))>; + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } } multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, ValueType EltVT64, int Opcode256> { - defm NAME # "32x4" : vinsert_for_size<Opcode128, + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 2, EltVT64, VR128X>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>; - let Predicates = [HasDQI] in - defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>, VEX_W; - defm NAME # "64x4" : vinsert_for_size<Opcode256, - X86VectorVTInfo< 4, EltVT64, VR256X>, - X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>, VEX_W; - let Predicates = [HasDQI] in - defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, - X86VectorVTInfo< 8, EltVT32, VR256X>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>; + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } } defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively 
via AVX512DQ. +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + +// Codegen pattern with the alternative types insert VEC128 into VEC256 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +// Codegen pattern with the alternative types insert VEC128 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types insert VEC256 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), @@ -566,90 +623,158 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), // AVX-512 VECTOR EXTRACT //--- +multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From, + X86VectorVTInfo To> { + // A subvector extract from the first vector position is + // a subregister copy that needs no instruction. + def NAME # To.NumElts: + Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))), + (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>; +} + multiclass vextract_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vextract_extract, - SDNodeXForm EXTRACT_get_vextract_imm> { + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vextract_extract> : + vextract_for_size_first_position_lowering<From, To> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to + // vextract_extract), we interesting only in patterns without mask, + // intrinsics pattern match generated bellow. 
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, u8imm:$idx), - "vextract" # To.EltTypeName # "x4", + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", - [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)))]>, - AVX512AIi8Base, EVEX, EVEX_V512; - let mayStore = 1 in - def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), - "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" - "$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; - } - - // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for - // vextracti32x4 - def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext)))>; - - // A 128/256-bit subvector extract from the first 512-bit vector position is - // a subregister copy that needs no instruction. - def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), - (To.VT - (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; - - // And for the alternative types. - def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), - (AltTo.VT - (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } // Intrinsic call with masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call with zero-masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call without masking. 
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), - (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; } -multiclass vextract_for_type<ValueType EltVT32, int Opcode32, - ValueType EltVT64, int Opcode64> { - defm NAME # "32x4" : vextract_for_size<Opcode32, +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>; - defm NAME # "64x4" : vextract_for_size<Opcode64, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, VEX_W; + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } } defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), @@ -677,6 +802,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), @@ -694,50 +823,49 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - ValueType svt, X86VectorVTInfo _> { - defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), - "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, - T8PD, EVEX; - let mayLoad = 1 in { - defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src), - "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", - (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, - T8PD, EVEX; - } +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, 
EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; } -multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, - AVX512VLVectorVTInfo _> { - defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, - EVEX_V256; + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; } } let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, - avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; let Predicates = [HasVLX] in { - defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, - v4f32, v4f32x_info>, EVEX_V128, - EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; } // avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. -// Later, we can canonize broadcast instructions before ISel phase and +// Later, we can canonize broadcast instructions before ISel phase and // eliminate additional patterns on ISel. 
// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar // representations of source @@ -834,70 +962,50 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; -multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, - RegisterClass KRC> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (vselect KRC:$mask, - (X86VBroadcast (ld_frag addr:$src)), - (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ; - } -} - -defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, - loadi32, VR512, v16i32, v4i32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, - loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VT1>; +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. 
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _Dst.RC:$dst, - (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - } + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; } defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", @@ -944,10 +1052,45 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", EVEX_V512, EVEX_CD8<32, CD8VT8>; } -def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), - (VPBROADCASTDZrr VR128X:$src)>; -def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), - (VPBROADCASTQZrr VR128X:$src)>; +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : 
avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; @@ -959,21 +1102,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>; - -def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>; - -def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZr VR128X:$src)>; -def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZr VR128X:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), @@ -985,170 +1113,178 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- - -multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { -let Predicates = [HasCDI] in -def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V512; - -let Predicates = [HasCDI, HasVLX] in { -def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V128; -def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V256; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; } + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } } -let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", - VK16>; + avx512vl_i32_info, VK16>; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", - VK8>, VEX_W; -} + avx512vl_i64_info, VK8>, VEX_W; //===----------------------------------------------------------------------===// -// AVX-512 - VPERM -// -// -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, u8imm:$src2), - 
!strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, - EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode (_.LdFrag addr:$src1), - (i8 imm:$src2))))]>, - EVEX, EVEX_CD8<_.EltSize, CD8VF>; +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } } +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, - X86VectorVTInfo Ctrl> : - avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { - let ExeDomain = _.ExeDomain in { - def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT Ctrl.RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, Ctrl.MemOp:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, - EVEX_4V; - } -} -defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, - EVEX_V512; -defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, - EVEX_V512, VEX_W; - -def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - -// -- VPERM2I - 3 source operands form -- -multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, 
VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst" in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.RC:$src3), + (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, AVX5128IBase; let mayLoad = 1 in defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.MemOp:$src3), + (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, EVEX_4V, AVX5128IBase; } } -multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let mayLoad = 1, Constraints = "$src1 = $dst" in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.ScalarMemOp:$src3), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (_.VT (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { - let Predicates = [HasAVX512] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, 
VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; } } -multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { let Predicates = [HasBWI] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - } -} -defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask 
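The hunk above rewrites the three-source permute definitions so VPERMI2*/VPERMT2* take an explicit index-vector type (X86VPermi2X / X86VPermt2) instead of a generic SDNode parameter. As a rough illustration of the two-source permute these patterns ultimately select, here is a minimal C++ sketch using the commonly documented immintrin.h intrinsic names and compiler flags; those names and flags are assumptions of this sketch, not something defined by this patch.

// Sketch: the two-source lane shuffle covered by the VPERMI2PS/VPERMT2PS patterns.
// Assumed build line (not from this patch): clang++ -O2 -mavx512f permute.cpp
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) float a[16], b[16];
  for (int i = 0; i < 16; ++i) { a[i] = float(i); b[i] = float(100 + i); }

  __m512 va = _mm512_load_ps(a);
  __m512 vb = _mm512_load_ps(b);
  // For 32-bit elements, index bits [3:0] pick the lane and bit 4 picks the
  // source vector (0 = first operand, 1 = second operand).
  __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19,
                                  4, 20, 5, 21, 6, 22, 7, 23);
  __m512 r = _mm512_permutex2var_ps(va, idx, vb);  // lowers to vpermt2ps/vpermi2ps

  alignas(64) float out[16];
  _mm512_store_ps(out, r);
  for (int i = 0; i < 16; ++i) printf("%g ", out[i]);  // interleaves a[0..7] with b[0..7]
  printf("\n");
  return 0;
}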
@@ -1265,41 +1401,85 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), //===----------------------------------------------------------------------===// // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, - PatFrag ld_frag, string Suffix> { - def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; - def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; let mayLoad = 1 in - def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, 
_.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, - XD, VEX_W; + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1700,6 +1880,128 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), 
+ (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst | $dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + //----------------------------------------------------------------- // Mask register copy, including // - copy between mask registers @@ -1786,6 +2088,11 @@ let Predicates = [HasDQI] in { (KMOVBmk addr:$dst, VK8:$src)>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; } let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), @@ -1837,10 +2144,15 @@ let Predicates = [HasAVX512] in { (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + def : Pat<(i64 (zext VK1:$src)), (AND64ri8 
(SUBREG_TO_REG (i64 0), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; @@ -1848,17 +2160,19 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; - def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; - def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; - def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; } +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. @@ -1955,11 +2269,12 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable> { + SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; + prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, @@ -1974,6 +2289,7 @@ defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2047,59 +2363,48 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; // Mask unpacking -multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { - let Predicates = [HasAVX512] in - def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; -} +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; -multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { - defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>, - VEX_4V, VEX_L, PD; + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } } -defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; -def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))), - 
(KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16), - (COPY_TO_REGCLASS VK8:$src1, VK16))>; - - -multiclass avx512_mask_unpck_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { - let Predicates = [HasAVX512], Defs = [EFLAGS] in + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; - let Predicates = [HasDQI] in - defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, - VEX, PD; - let Predicates = [HasBWI] in { - defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, - VEX, PS, VEX_W; - defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, - VEX, PD, VEX_W; - } +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2124,7 +2429,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, let Predicates = [HasDQI] in defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, VEX, TAPD; - } + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -2167,24 +2472,52 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; -let Predicates = [HasVLX] in { - def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; - def : 
Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; - def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; - def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; - def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; -} +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS @@ -2304,23 +2637,21 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore> { - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), - OpcodeStr # "\t{$src, $dst|$dst, $src}", [], - _.ExeDomain>, EVEX; - let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), - OpcodeStr # - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", [], _.ExeDomain>, EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # - "\t{$src, ${dst} {${mask}} {z}|" # + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", [], _.ExeDomain>, EVEX, EVEX_KZ; - } + let mayStore = 1 in { def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ 
-2425,22 +2756,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2502,17 +2817,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // @@ -2520,32 +2824,37 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG; + EVEX; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector GR64:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { -def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))], + [(set FR64X:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))], + [(set GR64:$dst, (bitconvert FR64X:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -} -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def VMOVSDto64Zmr 
: AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, EVEX_CD8<64, CD8VT1>; +} // Move Int Doubleword to Single Scalar // @@ -2553,27 +2862,27 @@ let isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move doubleword from xmm register to r/m32 // def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - EVEX, VEX_LIG; + EVEX; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128X:$src), + [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX, EVEX_CD8<32, CD8VT1>; // Move quadword from xmm1 register to r/m64 // @@ -2581,16 +2890,28 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], - IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W, + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Requires<[HasAVX512, In64BitMode]>; -def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs), - (ins i64mem:$dst, VR128X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), - addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>, - Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; // Move Scalar Single to Double Int // @@ -2599,92 +2920,95 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], - IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; + IIC_SSE_MOVD_ToGP>, EVEX; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move 
Quadword Int to Packed Quadword Int // -def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, - SDNode OpNode, ValueType vt, - X86MemOperand x86memop, PatFrag mem_pat> { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : 
avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -// For the disassembler -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR32X:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XS, EVEX_4V, VEX_LIG; - def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR64X:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XD, EVEX_4V, VEX_LIG, VEX_W; -} +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -2768,10 +3092,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; // Extract and store. 
- def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; @@ -2835,7 +3159,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; -let AddedComplexity = 20 in +let AddedComplexity = 20 , isCodeGenOnly = 1 in def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -2964,7 +3288,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, OpndItins itins, bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, IsCommutable>, @@ -2972,7 +3296,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), @@ -2986,7 +3310,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, @@ -3058,20 +3382,20 @@ multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, IsCommutable>; - defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, IsCommutable>; } multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, IsCommutable>; - defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, IsCommutable>; } @@ -3086,15 +3410,15 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, } multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, - SDNode OpNode,X86VectorVTInfo _Src, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst, bit IsCommutable = 0> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, 
_Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3106,12 +3430,12 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Dst.BroadcastStr##", $src1", "$src1, ${src2}"##_Dst.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Dst.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast (_Dst.ScalarLdFrag addr:$src2)))))), itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; @@ -3127,24 +3451,24 @@ defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, - SSE_INTALU_ITINS_P, HasBWI, 1>; + SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, - SSE_INTALU_ITINS_P, HasBWI, 0>; -defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, - SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; -defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, - HasBWI, 1>; -defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, - HasBWI, 1>, T8PD; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, - SSE_INTALU_ITINS_P, HasBWI, 1>; - + SSE_INTALU_ITINS_P, HasBWI, 1>; + multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { @@ -3159,7 +3483,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, v4i32x_info, v2i64x_info, IsCommutable>, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } -} +} defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, X86pmuldq, 1>,T8PD; @@ -3170,25 +3494,25 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { let mayLoad = 1 in { defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, 
${src2}"##_Src.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast (_Src.ScalarLdFrag addr:$src2))))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; } } -multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode,X86VectorVTInfo _Src, +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2)))>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; let mayLoad = 1 in { @@ -3229,102 +3553,59 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, v16i8x_info>, EVEX_V128; } } + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + let Predicates = [HasBWI] in { defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm 
VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; - -//===----------------------------------------------------------------------===// -// AVX-512 - Unpack Instructions -//===----------------------------------------------------------------------===// - -multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, - PatFrag mem_frag, RegisterClass RC, - X86MemOperand x86memop, string asm, - Domain d> { - def rr : AVX512PI<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, RC:$src2)))], - d>, EVEX_4V; - def rm : AVX512PI<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, - (bitconvert (mem_frag addr:$src2)))))], - d>, EVEX_4V; -} - -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, - VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, - VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, - VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, - VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - IIC_SSE_UNPCK>, EVEX_4V; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))], - IIC_SSE_UNPCK>, EVEX_4V; -} -defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; -defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// @@ -3362,12 +3643,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, let isCodeGenOnly = 1, isCommutable = IsCommutable, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.FRC:$src2), + (ins _.FRC:$src1, _.FRC:$src2), 
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], itins.rr>; def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))], itins.rr>; @@ -3375,7 +3656,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, } multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode VecNode, OpndItins itins, bit IsCommutable> { + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, @@ -3470,7 +3751,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, EVEX_4V, EVEX_B; } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3514,7 +3795,7 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; @@ -3550,13 +3831,34 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, @@ -3569,7 +3871,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -3586,7 +3888,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), + (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; @@ -3748,12 +4050,12 @@ multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, VTInfo.info256>, EVEX_V256; defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, EVEX_V128; } } -multiclass avx512_shift_rmi_w<bits<8> opcw, +multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3846,6 +4148,27 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, avx512vl_i64_info>, VEX_W; } +// Use 512bit version to implement 128/256 bit in case NoVLX. +multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3861,11 +4184,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, } defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; @@ -3916,19 +4242,77 @@ defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", X86VPermi, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, 
CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} + +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W; + X86PShufhw>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W; - + X86PShuflw>, EVEX, AVX512XDIi8Base; + multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; @@ -3942,55 +4326,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; //===----------------------------------------------------------------------===// -// AVX-512 - MOVDDUP -//===----------------------------------------------------------------------===// - 
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
- X86MemOperand x86memop, PatFrag memop_frag> {
-def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
-def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst,
- (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
-}
-
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
- (VMOVDDUPZrm addr:$src)>;
-
-//===---------------------------------------------------------------------===//
-// Replicate Single FP - MOVSHDUP and MOVSLDUP
-//===---------------------------------------------------------------------===//
-multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
- ValueType vt, RegisterClass RC, PatFrag mem_frag,
- X86MemOperand x86memop> {
- def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
- let mayLoad = 1 in
- def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
-}
-
-defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
- EVEX_CD8<32, CD8VF>;
-
-def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))),
- (VMOVSHDUPZrm addr:$src)>;
-def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))),
- (VMOVSLDUPZrm addr:$src)>;
-
-//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
@@ -4017,6 +4352,115 @@ let Predicates = [HasAVX512] in {
}
//===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 
1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // @@ -4034,7 +4478,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, - AVX512FMA3Base; + AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -4435,50 +4879,55 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - Intrinsic Int, Operand memop, ComplexPattern mem_cpat, - string asm> { -let hasSideEffects = 0 in { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; -} // hasSideEffects = 0 +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] } -let Predicates = [HasAVX512] in { + // Convert float/double to signed/unsigned int 32/64 -defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, 
"cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtss2usi64, ssmem, sse_load_f32, "cvtss2usi">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtsd2usi64, sdmem, sse_load_f64, "cvtsd2usi">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, EVEX_4V; @@ -4495,121 +4944,170 @@ let isCodeGenOnly = 1 in { defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", SSE_CVT_Scalar, 0>, XD, EVEX_4V; -} // isCodeGenOnly = 1 +} // isCodeGenOnly = 1, Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation -let isCodeGenOnly = 1 in { - defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si, - ssmem, sse_load_f32, "cvttss2si">, - XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si, - sdmem, sse_load_f64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttss2usi, ssmem, sse_load_f32, - "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttss2usi64, ssmem, - sse_load_f32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttsd2usi, - sdmem, sse_load_f64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttsd2usi64, sdmem, - sse_load_f64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -} // isCodeGenOnly = 1 - -multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins 
_SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; -} - -defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + 
fp_to_uint,X86cvttsd2UIntRnd>,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+let Predicates = [HasAVX512] in {
+ def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
+ (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
+ (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
+ (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
+ (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+ } // HasAVX512
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in {
-def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
- (ins FR32X:$src1, FR32X:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
-let mayLoad = 1 in
-def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
- (ins FR32X:$src1, f32mem:$src2),
- "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
- EVEX_CD8<32, CD8VT1>;
-
-// Convert scalar double to scalar single
-def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
- (ins FR64X:$src1, FR64X:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
-let mayLoad = 1 in
-def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
- (ins FR64X:$src1, f64mem:$src2),
- "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, EVEX_4V, VEX_LIG, VEX_W,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
-}
-
-def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
- Requires<[HasAVX512]>;
-def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
-
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNode> {
+ defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2)))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (OpNode (_Src.VT _Src.RC:$src1),
+ (_Src.VT (scalar_to_vector
+ (_Src.ScalarLdFrag addr:$src2)))))>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Scalar Conversion with SAE - suppress all exceptions
+multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1),
+ (_Src.VT _Src.RC:$src2),
+ (i32 FROUND_NO_EXC)))>,
+ EVEX_4V, VEX_LIG, EVEX_B;
+}
+
+// Scalar Conversion with rounding control (RC)
+multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86VectorVTInfo _Src, SDNode 
OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; - //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer @@ -4992,7 +5490,7 @@ defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [NoVLX] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -5024,40 +5522,102 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src), - "vcvtph2ps\t{$src, $dst|$dst, 
$src}", - []>, EVEX; - let hasSideEffects = 0, mayLoad = 1 in - def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX; -} - -multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX; - let hasSideEffects = 0, mayStore = 1 in - def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } } -defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; -defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; -def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src), - imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))), - (VCVTPS2PHZrr VR512:$src, imm:$rc)>; +} -def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src), - (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))), - (VCVTPH2PSZrr VR256X:$src)>; +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, 
X86VectorVTInfo _src> {
+  defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
+                (ins _src.RC:$src1, i32u8imm:$src2),
+                "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}",
+                (X86cvtps2ph (_src.VT _src.RC:$src1),
+                             (i32 imm:$src2),
+                             (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base;
+}
+let Predicates = [HasAVX512] in {
+  defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
+                    avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
+                    EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+  let Predicates = [HasVLX] in {
+    defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
+                         EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+    defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+                         EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+  }
+}
+
+// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
+multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode,
+                              string OpcodeStr> {
+  def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+                 !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+                 [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2,
+                                       (i32 FROUND_NO_EXC)))],
+                 IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+                 Sched<[WriteFAdd]>;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">,
+                   AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">,
+                   AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+  defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">,
+                  AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
+  defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">,
+                  AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
 let Defs = [EFLAGS], Predicates = [HasAVX512] in {
   defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
@@ -5067,10 +5627,10 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
                    "ucomisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   let Pattern = []<dag> in {
-    defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+    defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
                     "comiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-    defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+    defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
                     "comisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
   }
@@ -5092,50 +5652,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
 }
 /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
-multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         X86MemOperand x86memop> {
-  let hasSideEffects = 0 in {
-  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
-                    (ins RC:$src1, RC:$src2),
-                    !strconcat(OpcodeStr,
-                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         X86VectorVTInfo _> {
+  let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in {
+  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                          "$src2, $src1", "$src1, $src2",
+                          (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
   let mayLoad = 1 in {
-  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
-                    (ins RC:$src1, x86memop:$src2),
-                    !strconcat(OpcodeStr,
-                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V;
+  defm rm : AVX512_maskable_scalar<opc,
MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; } } } -defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; - -def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; - -def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5183,20 +5724,6 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; -def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRSQRT14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRSQRT14PDZr VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRCP14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRCP14PDZr VR512:$src)>; - /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { @@ -5232,6 +5759,8 @@ let hasSideEffects = 0, Predicates = [HasERI] in { defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; } + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass 
avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -5322,67 +5851,6 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, } } -multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int, - OpndItins itins_s, OpndItins itins_d> { - def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst), - (ins FR32X:$src1, FR32X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rr>, XS, EVEX_4V; - let isCodeGenOnly = 1 in - def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XS, EVEX_4V; - let mayLoad = 1 in { - def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst), - (ins FR32X:$src1, f32mem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - let isCodeGenOnly = 1 in - def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, sse_load_f32:$src2))], - itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - } - def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst), - (ins FR64X:$src1, FR64X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W; - let isCodeGenOnly = 1 in - def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XD, EVEX_4V, VEX_W; - let mayLoad = 1 in { - def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst), - (ins FR64X:$src1, f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - let isCodeGenOnly = 1 in - def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, sdmem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - } -} - multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, @@ -5416,93 +5884,77 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 
imm:$rc))>,
+                         EVEX_B, EVEX_RC;
+
+  let isCodeGenOnly = 1 in {
+    def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+              (ins _.FRC:$src1, _.FRC:$src2),
+              OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+
+    let mayLoad = 1 in
+      def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+                (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+  }
+
+  def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+            (!cast<Instruction>(NAME#SUFF#Zr)
+                (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
+
+  def : Pat<(_.EltVT (OpNode (load addr:$src))),
+            (!cast<Instruction>(NAME#SUFF#Zm)
                (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>;
+}
+
+multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
+  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
+                        X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
+                        X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+}
+
 defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
              avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
-defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt",
-                                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
-                                SSE_SQRTSS, SSE_SQRTSD>;
+defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
 let Predicates = [HasAVX512] in {
-  def : Pat<(f32 (fsqrt FR32X:$src)),
-            (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
-  def : Pat<(f32 (fsqrt (load addr:$src))),
-            (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[OptForSize]>;
-  def : Pat<(f64 (fsqrt FR64X:$src)),
-            (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
-  def : Pat<(f64 (fsqrt (load addr:$src))),
-            (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
-            Requires<[OptForSize]>;
   def : Pat<(f32 (X86frsqrt FR32X:$src)),
-            (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+            (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
   def : Pat<(f32 (X86frsqrt (load addr:$src))),
-            (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
             Requires<[OptForSize]>;
   def : Pat<(f32 (X86frcp FR32X:$src)),
-            (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+            (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
   def : Pat<(f32 (X86frcp (load addr:$src))),
-            (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
             Requires<[OptForSize]>;
-
-  def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
-            (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
-                              (COPY_TO_REGCLASS VR128X:$src, FR32)),
-                              VR128X)>;
-  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
-            (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
-
-  def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
-            (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
-                              (COPY_TO_REGCLASS VR128X:$src, FR64)),
-                              VR128X)>;
-  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
-            (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
-}
-
-
-multiclass avx512_rndscale<bits<8> opc, string OpcodeStr,
-                           X86MemOperand x86memop, RegisterClass RC,
-                           PatFrag mem_frag, Domain d> {
-let ExeDomain = d in {
-  // Intrinsic operation, reg.
- // Vector intrinsic operation, reg - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; - - // Vector intrinsic operation, mem - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; -} // ExeDomain } -defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - loadv16f32, SSEPackedSingle>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (v16f32 VR512:$src1), (i16 -1), - FROUND_CURRENT)), - (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; - - -defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - loadv8f64, SSEPackedDouble>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (v8f64 VR512:$src1), (i8 -1), - FROUND_CURRENT)), - (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; - multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { @@ -5510,20 +5962,20 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, - "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; let mayLoad = 1 in defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; } @@ -5568,109 +6020,238 @@ defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; -let Predicates = [HasAVX512] in { -def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; -def : Pat<(v16f32 (fnearbyint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0xC))>; -def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x2))>; -def : Pat<(v16f32 (frint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x4))>; -def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x3))>; - -def : Pat<(v8f64 (ffloor VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x1))>; -def : Pat<(v8f64 (fnearbyint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0xC))>; -def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x2))>; -def : Pat<(v8f64 (frint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; -def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; -} 
 //-------------------------------------------------
 // Integer truncate and extend operations
 //-------------------------------------------------
-multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
-                            RegisterClass dstRC, RegisterClass srcRC,
-                            RegisterClass KRC, X86MemOperand x86memop> {
-  def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
-                      (ins srcRC:$src),
-                      !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
+                               X86MemOperand x86memop> {
+
+  defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
+                            (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
+                            (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
+                            EVEX, T8XS;
+
+  // for intrinsic pattern matching
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                          (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                          undef)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+                               SrcInfo.RC:$src1)>;
+
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                          (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                          DestInfo.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
+                               SrcInfo.RC:$src1)>;
+
+  def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
+                          (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+                          DestInfo.RC:$src0)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
+                               DestInfo.KRCWM:$mask ,
+                               SrcInfo.RC:$src1)>;
+
+  let mayStore = 1 in {
+    def mr : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, SrcInfo.RC:$src),
+               OpcodeStr # "\t{$src, $dst |$dst, $src}",
               []>, EVEX;
-  def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
-                       (ins KRC:$mask, srcRC:$src),
-                       !strconcat(OpcodeStr,
-                                  "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+    def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
+               OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
               []>, EVEX, EVEX_K;
+  }//mayStore = 1
+}
-  def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
-                        (ins KRC:$mask, srcRC:$src),
-                        !strconcat(OpcodeStr,
-                                   "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-                        []>, EVEX, EVEX_KZ;
+multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
+                                    X86VectorVTInfo DestInfo,
+                                    PatFrag truncFrag, PatFrag mtruncFrag > {
-  def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
-           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           []>, EVEX;
+  def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr)
+                                addr:$dst, SrcInfo.RC:$src)>;
-  def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
-            (ins x86memop:$dst, KRC:$mask, srcRC:$src),
-            !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
-            []>, EVEX, EVEX_K;
+  def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask,
+                        (SrcInfo.VT SrcInfo.RC:$src)),
+            (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk)
+                                addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
+}
+
+multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo,
+                                        X86VectorVTInfo DestInfo, string sat > {
+
+  def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix#
+                             DestInfo.Suffix#"_mem_"#SrcInfo.Size)
+                             addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask),
+           (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr,
+                               (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM),
+                               (SrcInfo.VT SrcInfo.RC:$src))>;
+
def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; - -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; - -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; +multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, 
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, 
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, @@ -5985,163 +6566,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -//===----------------------------------------------------------------------===// -// VSHUFPS - VSHUFPD Operations - -multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, - ValueType vt, string OpcodeStr, PatFrag mem_frag, - Domain d> { - def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], d, 
IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; - def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffle]>; -} - -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, - SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v16i32 (X86Shufp VR512:$src1, - (loadv16i32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; - -def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v8i64 (X86Shufp VR512:$src1, - (loadv8i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; // Helper fragments to match sext vXi1 to vXiY. def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, - RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, - X86MemOperand x86scalar_mop, string BrdcstStr> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", - BrdcstStr, "}"), - []>, EVEX, EVEX_KZ, EVEX_B; - - let Constraints = "$src1 = $dst" in { - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), - []>, EVEX, EVEX_K, EVEX_B; - } - } -} - -let Predicates = [HasCDI] in { 
-defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPCONFLICTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPCONFLICTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -let Predicates = [HasCDI] in { -defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPLZCNTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPLZCNTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), - (VPLZCNTDrm addr:$src)>; -def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), - (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), - (VPLZCNTQrm addr:$src)>; -def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), - (VPLZCNTQrr VR512:$src)>; - def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; @@ -6197,7 +6626,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, @@ -6230,7 +6659,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; let mayStore = 1 in { @@ -6242,7 +6671,7 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(store (_.VT (vselect _.KRCWM:$mask, + [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; @@ -6272,7 +6701,7 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; let mayLoad = 1 in @@ 
-6302,6 +6731,62 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
 defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
                                       EVEX, VEX_W;
+//handle instruction reg_vec1 = op(reg_vec,imm)
+//                             op(mem_vec,imm)
+//                             op(broadcast(eltVt),imm)
+//all instructions are created with FROUND_CURRENT
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                      X86VectorVTInfo _>{
+  defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+                      (OpNode (_.VT _.RC:$src1),
+                              (i32 imm:$src2),
+                              (i32 FROUND_CURRENT))>;
+  let mayLoad = 1 in {
+    defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.MemOp:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+                      (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                              (i32 imm:$src2),
+                              (i32 FROUND_CURRENT))>;
+    defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                      (ins _.ScalarMemOp:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
+                      "${src1}"##_.BroadcastStr##", $src2",
+                      (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
+                              (i32 imm:$src2),
+                              (i32 FROUND_CURRENT))>, EVEX_B;
+  }
+}
+
+//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
+multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
+                                          SDNode OpNode, X86VectorVTInfo _>{
+  defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                      (ins _.RC:$src1, i32u8imm:$src2),
+                      OpcodeStr##_.Suffix, "$src2,{sae}, $src1",
+                      "$src1, {sae}, $src2",
+                      (OpNode (_.VT _.RC:$src1),
+                              (i32 imm:$src2),
+                              (i32 FROUND_NO_EXC))>, EVEX_B;
+}
+
+multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
+            AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+  let Predicates = [prd] in {
+    defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+             avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+             EVEX_V512;
+  }
+  let Predicates = [prd, HasVLX] in {
+    defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+                EVEX_V128;
+    defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+                EVEX_V256;
+  }
+}
+
 //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
 //                             op(reg_vec2,mem_vec,imm)
 //                             op(reg_vec2,broadcast(eltVt),imm)
@@ -6309,49 +6794,60 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                 X86VectorVTInfo _>{
   defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
-                      (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
+                      (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
                       (OpNode (_.VT _.RC:$src1),
                               (_.VT _.RC:$src2),
-                              (i8 imm:$src3),
+                              (i32 imm:$src3),
                               (i32 FROUND_CURRENT))>;
   let mayLoad = 1 in {
     defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                      (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
+                      (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
                       (OpNode (_.VT _.RC:$src1),
                               (_.VT (bitconvert (_.LdFrag addr:$src2))),
-                              (i8 imm:$src3),
+                              (i32 imm:$src3),
                               (i32 FROUND_CURRENT))>;
     defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                      (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
+                      (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1,
${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>, EVEX_B; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (i8 imm:$src3))>; - let mayLoad = 1 in { - defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3))>; + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let mayLoad = 1 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", @@ -6359,7 +6855,6 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), (i8 imm:$src3))>, EVEX_B; - } } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -6369,20 +6864,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let isAsmParserOnly = 1 in { @@ -6398,18 +6893,25 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins 
_.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3,{sae}, $src2, $src1", "$src1, $src2,{sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_NO_EXC))>, EVEX_B; } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { - defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>; + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, @@ -6428,6 +6930,20 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, } } +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode>{ let Predicates = [HasAVX512] in { @@ -6447,6 +6963,14 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, } } +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6461,6 +6985,14 @@ defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, 0x55, X86VFixupimm, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6475,6 +7007,19 @@ defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, 0x51, X86VRange, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, 
CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode = X86Shuf128>{ @@ -6486,6 +7031,29 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; } } +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; @@ -6496,31 +7064,51 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, - AVX512VLVectorVTInfo VTInfo_FP>{ +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, AVX512AIi8Base, EVEX_4V; - let isCodeGenOnly = 1 in { - defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>, - AVX512AIi8Base, EVEX_4V; - } } -defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>, +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>, +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, 
CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr##_.Suffix, + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.MemOp:$src1), OpcodeStr##_.Suffix, + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; @@ -6531,7 +7119,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_unary_rm<opc, OpcodeStr, OpNode, _> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src1), OpcodeStr##_.Suffix, + (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, (_.VT (OpNode (X86VBroadcast @@ -6568,15 +7156,16 @@ multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info, + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, prd>, VEX_W; - defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; } multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>; - defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, avx512vl_i8_info, prd>; + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; } multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, @@ -6598,3 +7187,332 @@ def : Pat<(xor (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT 
(OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, 
(outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + 
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, 
(outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 5e19ad448fc7..1a2e786661e9 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; -def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem , - Imm8 , i8imm , imm, i8imm , invalid_node, +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, imm, i16i8imm, i16immSExt8, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, imm, i32i8imm, i32immSExt8, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, @@ -928,15 +928,22 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let hasSideEffects = 0; } -// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define // and use EFLAGS. -class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> +class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, IIC_BIN_CARRY_NONMEM> { let Uses = [areg, EFLAGS]; } +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". 
/// @@ -1092,14 +1099,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Uses = [EFLAGS], Defs = [EFLAGS] - def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1170,14 +1177,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Defs = [EFLAGS] - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1246,14 +1253,14 @@ let isCompare = 1 in { "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] - def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|al, $src}">; - def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|ax, $src}">; - def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|eax, $src}">; - def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|rax, $src}">; + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; } // isCompare //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 2056056d23a5..787f15bc628e 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -156,10 +156,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { Flags |= MachineMemOperand::MOLoad; if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); return addOffset(MIB.addFrameIndex(FI), Offset) .addMemOperand(MMO); } diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index 315f21308c0d..c73c95019f8d 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ 
b/lib/Target/X86/X86InstrCMovSetCC.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// -// SetCC instructions. +// CMOV instructions. multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteALU] in { diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 7f850d6830e1..5d7283f7bd57 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -132,26 +132,6 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), Requires<[In64BitMode]>; } -// The MSVC runtime contains an _ftol2 routine for converting floating-point -// to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is -// used as a temporary register. No other registers (aside from flags) are -// touched. -// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 -// variant is unnecessary. - -let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { - def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), - "# win32 fptoui", - [(X86WinFTOL RFP32:$src)]>, - Requires<[Not64BitMode]>; - - def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), - "# win32 fptoui", - [(X86WinFTOL RFP64:$src)]>, - Requires<[Not64BitMode]>; -} - //===----------------------------------------------------------------------===// // EH Pseudo Instructions // @@ -172,6 +152,29 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), } +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), @@ -247,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. 
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1 in + isPseudo = 1, AddedComplexity = 20 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; @@ -259,6 +262,33 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { let AddedComplexity = 20; } +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 15 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. + let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { +// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. +// FIXME: Add itinerary class and Schedule. +def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", + [(set GR32:$dst, i32immSExt8:$src)]>, + Requires<[OptForMinSize]>; +def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", + [(set GR64:$dst, i64immSExt8:$src)]>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; +} + // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. @@ -268,9 +298,9 @@ def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant, and for labels in the +// actually the zero-extension of a 32-bit constant and for labels in the // x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; let AddedComplexity = 1 in def : Pat<(i64 mov64imm32:$src), @@ -509,6 +539,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; @@ -752,67 +783,111 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. 
* It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) */ -multiclass RELEASE_BINOP_MI<string op> { +multiclass RELEASE_BINOP_MI<SDNode op> { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
} -defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; -defm RELEASE_AND : RELEASE_BINOP_MI<"and">; -defm RELEASE_OR : RELEASE_BINOP_MI<"or">; -defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; -// Note: we don't deal with sub, because substractions of constants are -// optimized into additions before this code can run multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } -defm RELEASE_INC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 1)), - (add (atomic_load_16 addr:$dst), (i16 1)), - (add (atomic_load_32 addr:$dst), (i32 1)), - (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; -defm RELEASE_DEC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 -1)), - (add (atomic_load_16 addr:$dst), (i16 -1)), - (add (atomic_load_32 addr:$dst), (i32 -1)), - (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} /* TODO: These don't work because the type inference of TableGen fails. TODO: find a way to fix it. -defm RELEASE_NEG : RELEASE_UNOP< - (ineg (atomic_load_8 addr:$dst)), - (ineg (atomic_load_16 addr:$dst)), - (ineg (atomic_load_32 addr:$dst)), - (ineg (atomic_load_64 addr:$dst))>; +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. 
defm RELEASE_NOT : RELEASE_UNOP< (not (atomic_load_8 addr:$dst)), (not (atomic_load_16 addr:$dst)), @@ -821,42 +896,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// @@ -1077,11 +1152,11 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; // zextload bool -> zextload byte def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; -def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; -def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), - (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1298,7 +1373,6 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), sub_32bit)>; // r & (2^16-1) ==> movz -let AddedComplexity = 1 in // Give priority over i64immZExt32. 
 def : Pat<(and GR64:$src, 0xffff),
   (SUBREG_TO_REG (i64 0),
       (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 4cd5563ce727..8c351a51c460 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -53,6 +53,19 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
                    "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
   def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
+
+  // This is the machine return-from-interrupt instruction, but sometimes we
+  // need to perform a post-epilogue stack adjustment. Codegen emits the
+  // pseudo form, which expands to include an SP adjustment if necessary.
+  def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
+               OpSize16;
+  def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
+               IIC_IRET>, OpSize32;
+  def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
+               IIC_IRET>, Requires<[In64BitMode]>;
+  let isCodeGenOnly = 1 in
+  def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>;
+
 }
 // Unconditional branches.
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 7cc3b599a737..fd800cf077f7 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -15,13 +15,31 @@
 // FMA3 - Intel 3 operand Fused Multiply-Add instructions
 //===----------------------------------------------------------------------===//
-let Constraints = "$src1 = $dst" in {
+// For all FMA opcodes declared in the fma3p_rm and fma3s_rm multiclasses
+// defined below, both the register and memory variants are commutable.
+// For the register form the commutable operands are 1, 2 and 3.
+// For the memory variant the folded operand must be in 3. Thus,
+// in that case, only the operands 1 and 2 can be swapped.
+// Commuting some of the operands may require an opcode change.
+// FMA*213*:
+//   operands 1 and 2 (memory & register forms): *213* --> *213* (no change);
+//   operands 1 and 3 (register forms only):     *213* --> *231*;
+//   operands 2 and 3 (register forms only):     *213* --> *132*.
+// FMA*132*:
+//   operands 1 and 2 (memory & register forms): *132* --> *231*;
+//   operands 1 and 3 (register forms only):     *132* --> *132* (no change);
+//   operands 2 and 3 (register forms only):     *132* --> *213*.
+// FMA*231*:
+//   operands 1 and 2 (memory & register forms): *231* --> *132*;
+//   operands 1 and 3 (register forms only):     *231* --> *213*;
+//   operands 2 and 3 (register forms only):     *231* --> *231* (no change).
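The opcode-rewrite table above can also be read as a small total function from a form plus a swapped operand pair to the equivalent form. As a minimal illustrative sketch only, not code from this patch and not an LLVM API (the helper name is made up), it could be written in C++ as:

    #include <cassert>

    // Hypothetical helper: returns the FMA form (132, 213 or 231) that encodes
    // the same computation after swapping source operands OpA and OpB
    // (1-based), mirroring the comment table above.
    static unsigned commuteFMAForm(unsigned Form, unsigned OpA, unsigned OpB) {
      assert((Form == 132 || Form == 213 || Form == 231) && "unknown FMA form");
      assert(((OpA == 1 && (OpB == 2 || OpB == 3)) || (OpA == 2 && OpB == 3)) &&
             "operands are 1, 2 or 3");
      if (OpA == 1 && OpB == 2)   // legal for both register and memory forms
        return Form == 213 ? 213 : (Form == 132 ? 231 : 132);
      if (OpA == 1 && OpB == 3)   // register forms only
        return Form == 213 ? 231 : (Form == 132 ? 132 : 213);
      return Form == 213 ? 132 : (Form == 132 ? 213 : 231); // swap of 2 and 3
    }

Evaluating it for the nine form/pair combinations reproduces the table row for row; for example, commuteFMAForm(132, 1, 2) yields 231.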
+ +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, (OpVT256 (Op VR256:$src2, VR256:$src1, (MemFrag256 addr:$src3))))]>, VEX_L; } -} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - Op>; -let hasSideEffects = 0 in { + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; - // For 231, only the register variant is commutable. - // For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; -} // hasSideEffects = 0 + MemFrag128, MemFrag256, OpTy128, OpTy256>; } // Fused Multiply-Add @@ -126,83 +130,122 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Constraints = "$src1 = $dst" in { -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, - RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. 
In many cases such a commute transformation requires an opcode
+// adjustment, for example, commuting the operands 1 and 2 in the FMA*132 form
+// would require an opcode change to FMA*231:
+//   FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+//   -->
+//   FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Please see the more detailed comment at the very beginning of the section
+// defining FMA3 opcodes above.
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+                    X86MemOperand x86memop, RegisterClass RC,
                     SDPatternOperator OpNode = null_frag> {
-  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
+  let usesCustomInserter = 1 in
   def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-               [(set RC:$dst,
-                 (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+               [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
-  let mayLoad = 1, isCommutable = IsMVariantCommutable in
+  let mayLoad = 1 in
   def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, x86memop:$src3),
                !strconcat(OpcodeStr,
                           "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                [(set RC:$dst,
-                 (OpVT (OpNode RC:$src2, RC:$src1,
-                               (mem_frag addr:$src3))))]>;
+                 (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+}
+
+// These FMA*_Int instructions are defined specially for being used when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense, they are similar to the existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis;
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such an
+// analysis may not be implemented yet, we allow the routines doing the actual
+// commute transformation to decide if one or another instruction is
+// commutable or not.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+    hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+                        Operand memopr, RegisterClass RC> {
+  def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   []>;
+
+  let mayLoad = 1 in
+  def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, memopr:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   []>;
 }
-} // Constraints = "$src1 = $dst"
 multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
-                       string OpStr, string PackTy, string PT2, Intrinsic Int,
-                       SDNode OpNode, RegisterClass RC, ValueType OpVT,
-                       X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
-                       ComplexPattern mem_cpat> {
-let hasSideEffects = 0 in {
-  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
-                       x86memop, RC, OpVT, mem_frag>;
-  // See the other defm of r231 for the explanation regarding the
-  // commutable flags.
-  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
-                       x86memop, RC, OpVT, mem_frag,
-                       /* IsRVariantCommutable */ 1,
-                       /* IsMVariantCommutable */ 0>;
+                       string OpStr, string PackTy,
+                       SDNode OpNode, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+  defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
+                       OpNode>;
+  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>;
 }
-// See the other defm of r213 for the explanation regarding the
-// commutable flags.
-defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
-                     x86memop, RC, OpVT, mem_frag,
-                     /* IsRVariantCommutable */ 1,
-                     /* IsMVariantCommutable */ 1,
-                     OpNode>;
+// The FMA 213 form is created for lowering the scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of the
+// 213 or 132 forms and is possible only after a special analysis of all uses
+// of the initial instruction. Such an analysis does not exist yet, and thus
+// the 231 form of the FMA*_Int instructions is introduced under the optimistic
+// assumption that such an analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                           string OpStr, string PackTy,
+                           RegisterClass RC, Operand memop> {
+  defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+                           memop, RC>;
+  defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+                           memop, RC>;
+  defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+                           memop, RC>;
 }
 multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr,
                  Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> {
-  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
-                        FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
-  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
-                        FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+  let ExeDomain = SSEPackedSingle in
+  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+                        FR32, f32mem>,
+            fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+
+  let ExeDomain = SSEPackedDouble in
+  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+                        FR64, f64mem>,
+            fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+            VEX_W;
-// These patterns use the 123 ordering, instead of 213, even though
-// they match the intrinsic to the 213 version of the instruction.
-// This is because src1 is tied to dest, and the scalar intrinsics
-// require the pass-through values to come from the first source
-// operand, not the second.
+  // These patterns use the 123 ordering, instead of 213, even though
+  // they match the intrinsic to the 213 version of the instruction.
+  // This is because src1 is tied to dest, and the scalar intrinsics
+  // require the pass-through values to come from the first source
+  // operand, not the second.
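The lane behaviour described in the comment just above can be made concrete with a small model. This is an illustrative C++ sketch of the assumed semantics only (not code from this patch), treating an XMM register as four floats:

    #include <array>
    using V4F = std::array<float, 4>;   // model of an XMM register, 4 x f32

    // Assumed semantics of the scalar FMA intrinsic: lane 0 holds the FMA
    // result, lanes 1-3 pass through from the FIRST source operand.
    static V4F fmadd_ss_intrinsic(V4F A, V4F B, V4F C) {
      V4F R = A;                        // pass-through lanes come from A
      R[0] = A[0] * B[0] + C[0];
      return R;
    }

    // Assumed semantics of the scalar 213 form with $dst tied to $src1:
    // lane 0 <- src2 * src1 + src3, upper lanes keep the tied $src1 value.
    static V4F vfmadd213ss_model(V4F Src1, V4F Src2, V4F Src3) {
      V4F R = Src1;
      R[0] = Src2[0] * Src1[0] + Src3[0];
      return R;
    }

Feeding the intrinsic operands through in 123 order, with the first operand as the tied $src1, makes the two functions agree: both keep A's upper lanes and compute A[0] * B[0] + C[0] in lane 0, which is why the patterns below pass $src1, $src2, $src3 straight to the 213 _Int instruction.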
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SSr213r") - (COPY_TO_REGCLASS $src1, FR32), - (COPY_TO_REGCLASS $src2, FR32), - (COPY_TO_REGCLASS $src3, FR32)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SDr213r") - (COPY_TO_REGCLASS $src1, FR64), - (COPY_TO_REGCLASS $src2, FR64), - (COPY_TO_REGCLASS $src3, FR64)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; } defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, @@ -334,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -379,6 +409,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + 
fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 49068e9c37d3..03ae21125b0e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + 
[!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<fadd, MRM0m, "add">; defm SUB : FPBinary<fsub, MRM4m, "sub">; -defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary<fmul, MRM1m, "mul">; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary<fdiv, MRM6m, "div">; -defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; } } @@ -306,13 +336,13 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; -def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; -def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; // Floating point cmovs. 
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : @@ -633,16 +663,18 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; -def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; -def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; -def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; -def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 1f61ffa84e9a..829cedd55fb3 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -58,13 +60,17 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", @@ -74,11 +80,18 @@ def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, 
SDTCisSameAs<0,2>]>>; def X86psadbw : SDNode<"X86ISD::PSADBW", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -86,9 +99,11 @@ def X86psign : SDNode<"X86ISD::PSIGN", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; def X86pinsrb : SDNode<"X86ISD::PINSRB", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; @@ -114,19 +129,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, @@ -136,6 +149,35 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; @@ -159,10 +201,15 @@ def X86CmpMaskCCRound : def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86cmpm 
: SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; -def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -178,6 +225,29 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; @@ -190,6 +260,7 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>, @@ -201,11 +272,15 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86extrqi : SDNode<"X86ISD::EXTRQI", SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, @@ -221,24 +296,30 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; -def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVec<2>]>; + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>]>; + 
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; @@ -250,15 +331,17 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<2>]>; + SDTCisVec<0>, SDTCisVT<2, i32>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>]>; + SDTCisVec<0>, SDTCisVT<3, i32>]>; def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; -def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -281,33 +364,74 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; -def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, 
[SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; -def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. +def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; @@ -317,11 +441,13 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -341,9 +467,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; -def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; -def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", 
STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -362,7 +490,8 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; @@ -371,9 +500,12 @@ def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; - +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCVecEltisVT<1, i32>, SDTCisInt<2>]>; @@ -392,6 +524,10 @@ def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff @@ -417,17 +553,35 @@ def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, SDTCisOpSmallerThanOp<1, 0>, - SDTCisInt<2>]>>; + SDTCisVT<2, i32>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisInt<2>]>>; + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -436,10 +590,10 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", // These are 'extloads' from a scalar to the low element of a vector, zeroing // the top elements. These are used for the SSE 'ss' and 'sd' instruction // forms. 
-def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; -def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; @@ -490,9 +644,9 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def loadf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; def loadf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -590,9 +744,9 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a @@ -604,32 +758,6 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; -// MOVNT Support -// Like 'store', but requires the non-temporal bit to be set -def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal(); - return false; -}]>; - -def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && !ST->isTruncatingStore() && - ST->getAddressingMode() == ISD::UNINDEXED && - ST->getAlignment() >= 16; - return false; -}]>; - -def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && - ST->getAlignment() < 16; - return false; -}]>; - def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -851,29 +979,59 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), return isa<MaskedLoadSDNode>(N); }]>; +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 16; return false; }]>; def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 32; return false; }]>; def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 64; return false; }]>; def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ return isa<MaskedStoreSDNode>(N); }]>; +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index cf68ef053361..63e78de69bc9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -101,9 +102,11 @@ struct X86MemoryFoldTableEntry { void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo( - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -332,6 +335,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -495,7 +501,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, @@ -605,7 +610,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, @@ -1647,6 +1651,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -1729,11 +1739,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1749,11 +1765,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, 
X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1769,11 +1791,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1789,11 +1817,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, @@ -2282,7 +2316,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: { + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
if (MI->getOperand(1+X86::AddrBaseReg).isReg() && MI->getOperand(1+X86::AddrScaleAmt).isImm() && @@ -2363,9 +2425,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // It is safe to clobber EFLAGS at the end of a block of no successor has it // live in. if (Iter == E) { - for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), - SE = MBB.succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) return false; return true; } @@ -2411,13 +2472,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - // MOV32r0 is implemented with a xor which clobbers condition code. - // Re-materialize it as movri instructions to avoid side effects. - unsigned Opc = Orig->getOpcode(); - if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + DebugLoc DL = Orig->getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(0); + .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); @@ -2428,7 +2505,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -static bool hasLiveCondCodeDef(MachineInstr *MI) { +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && @@ -2453,7 +2530,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. - // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // A LEA instruction utilizes a SIB byte to encode its scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; @@ -2493,7 +2570,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, ImplicitOp = Src; ImplicitOp.setImplicit(); - NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); @@ -2914,10 +2991,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// We have a few instructions that must be hacked on to commute them. -/// -MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function. 
It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case 
X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + 
*IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2944,7 +3173,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2980,7 +3209,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -2995,7 +3224,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3016,7 +3245,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3045,7 +3274,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3124,11 +3353,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. } default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } +} + +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. 
+ if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Both of operands are not fixed. By default set one of commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is useless then. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. 
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3141,46 +3631,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -3821,15 +4287,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -inline static bool MaskRegClassContains(unsigned Reg) { +static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { @@ -3837,21 +4346,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if (MaskRegClassContains(DestReg) && - MaskRegClassContains(SrcReg)) + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && - 
(X86::GR32RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg) || - X86::GR8RegClass.contains(SrcReg))) { - SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); return X86::KMOVWkr; } - if ((X86::GR32RegClass.contains(DestReg) || - X86::GR16RegClass.contains(DestReg) || - X86::GR8RegClass.contains(DestReg)) && - MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); return X86::KMOVWrk; } return 0; @@ -3886,7 +4388,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) @@ -3900,34 +4402,86 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - if (SrcReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF64)); - BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } return; } - if (X86::GR32RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF32)); - BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return; + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. 
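The OF save/restore trick described in this comment can be checked in isolation: SETO leaves AL = 1 exactly when OF was set, and adding INT8_MAX to AL signed-overflows, which sets OF again, exactly when AL is 1. A minimal standalone sketch of that property; the helper only models how ADD8ri defines OF and is not part of the patch:

    #include <cassert>
    #include <cstdint>

    // True iff an 8-bit signed addition overflows, i.e. the condition under
    // which the hardware sets OF for ADD8ri.
    static bool addSignedOverflows8(int8_t a, int8_t b) {
      int wide = int(a) + int(b);
      return wide < INT8_MIN || wide > INT8_MAX;
    }

    int main() {
      for (int of = 0; of <= 1; ++of) {
        int8_t al = int8_t(of);                                // what SETO left in AL
        bool RestoredOF = addSignedOverflows8(al, INT8_MAX);   // ADD AL, INT8_MAX
        assert(RestoredOF == (of == 1));
      }
      return 0;
    }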
Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } - } - if (DestReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF64)); - return; + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); } - if (X86::GR32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF32)); - return; + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -4602,9 +5156,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) return false; } @@ -4645,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) - OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); return true; } @@ -4694,8 +5247,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); - if (FoldMI) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4725,6 +5277,82 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + +bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + int64_t Imm = MIB->getOperand(1).getImm(); + assert(Imm != 0 && "Using push/pop for 0 is not efficient."); + MachineBasicBlock::iterator I = MIB.getInstr(); + + int StackAdjustment; + + if (Subtarget.is64Bit()) { + assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || + MIB->getOpcode() == X86::MOV32ImmSExti8); + // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and + // widen the register if necessary. + StackAdjustment = 8; + BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(get(X86::POP64r)); + MIB->getOperand(0) + .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); + } else { + assert(MIB->getOpcode() == X86::MOV32ImmSExti8); + StackAdjustment = 4; + BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(get(X86::POP32r)); + } + + // Build CFI if necessary. + MachineFunction &MF = *MBB.getParent(); + const X86FrameLowering *TFL = Subtarget.getFrameLowering(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsDwarfCFI = + !IsWin64Prologue && + (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; + if (EmitCFI) { + TFL->BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); + TFL->BuildCFI(MBB, std::next(I), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); + } + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
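As a quick semantic check of the expansions above: MOV32r1/MOV32r_1 become XOR32rr followed by INC32r/DEC32r, and MOV32ImmSExti8/MOV64ImmSExti8 become a sign-extending push of the 8-bit immediate followed by a pop of the full-width register. The sketch below only models the architectural effect; the register values are arbitrary and the code is illustrative, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // MOV32r1:  xor reg,reg; inc reg      MOV32r_1: xor reg,reg; dec reg
      uint32_t R1 = 0xDEADBEEF, R2 = 0xCAFEF00D;
      R1 ^= R1; ++R1;                        // XOR32rr + INC32r
      R2 ^= R2; --R2;                        // XOR32rr + DEC32r (wraps to -1)
      assert(R1 == 1u && R2 == 0xFFFFFFFFu);

      // MOV64ImmSExti8: PUSH64i8 sign-extends the immediate, POP64r reads it back.
      for (int Imm = -128; Imm != 128; ++Imm) {
        if (Imm == 0)
          continue;                          // 0 is handled by MOV32r0 instead
        int64_t Popped = int64_t(int8_t(Imm));  // value left in the register
        assert(Popped == Imm);
      }
      return 0;
    }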
static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -4735,8 +5363,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4753,6 +5381,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::MOV32ImmSExti8: + case X86::MOV64ImmSExti8: + return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -4777,10 +5412,22 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. + // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. case X86::KSET0B: - case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1B: - case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; @@ -4788,12 +5435,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) { +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); - if (NumAddrOps < 4) // FrameIndex only - addOffset(MIB, 0); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. 
+ assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, @@ -4828,7 +5491,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); @@ -4838,7 +5502,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs); + addOperands(MIB, MOs, PtrOffset); } else { MIB.addOperand(MO); } @@ -4860,6 +5524,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, @@ -4869,10 +5567,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // For CPUs that favor the register form of a call, - // do not fold loads into calls. - if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) return nullptr; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4886,6 +5587,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. 
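The INSERTPS fold above works because the memory form of the instruction ignores the source-select bits: loading the single float at PtrOffset = SrcIdx * 4 and dropping SrcIdx from the immediate gives the same result as the register form. A standalone model of both forms, assuming the usual imm8 layout used by the code above (ZMask in bits 0-3, DstIdx in bits 4-5, SrcIdx in bits 6-7); lane values are arbitrary:

    #include <array>
    #include <cassert>
    #include <cstdint>

    using V4 = std::array<float, 4>;

    // Register form: select Src[SrcIdx], write it to Dst[DstIdx], zero ZMask lanes.
    static V4 insertps_rr(V4 Dst, const V4 &Src, uint8_t Imm) {
      unsigned ZMask = Imm & 0xF, DstIdx = (Imm >> 4) & 3, SrcIdx = (Imm >> 6) & 3;
      Dst[DstIdx] = Src[SrcIdx];
      for (unsigned i = 0; i != 4; ++i)
        if (ZMask & (1u << i))
          Dst[i] = 0.0f;
      return Dst;
    }

    // Memory form: the loaded 32-bit value is inserted directly; SrcIdx is unused.
    static V4 insertps_rm(V4 Dst, float Mem, uint8_t Imm) {
      unsigned ZMask = Imm & 0xF, DstIdx = (Imm >> 4) & 3;
      Dst[DstIdx] = Mem;
      for (unsigned i = 0; i != 4; ++i)
        if (ZMask & (1u << i))
          Dst[i] = 0.0f;
      return Dst;
    }

    int main() {
      const V4 Dst = {1.0f, 2.0f, 3.0f, 4.0f}, Src = {10.0f, 20.0f, 30.0f, 40.0f};
      for (unsigned Imm = 0; Imm != 256; ++Imm) {
        unsigned ZMask = Imm & 0xF, DstIdx = (Imm >> 4) & 3, SrcIdx = (Imm >> 6) & 3;
        unsigned NewImm = (DstIdx << 4) | ZMask;   // same rewrite as in the fold
        float Loaded = Src[SrcIdx];                // the load at PtrOffset = SrcIdx * 4
        assert(insertps_rr(Dst, Src, uint8_t(Imm)) ==
               insertps_rm(Dst, Loaded, uint8_t(NewImm)));
      }
      return 0;
    }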
+ if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. @@ -4963,60 +5670,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute. - return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. return nullptr; } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. 
+ return nullptr; } } @@ -5208,13 +5911,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; + if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = Subtarget.hasAVX(); - unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. @@ -5222,21 +5926,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); - } else - return; - MI->addRegisterKilled(Reg, TRI, true); + MI->addRegisterKilled(Reg, TRI, true); + } } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // Check switch flag - if (NoFusing) return nullptr; + if (NoFusing) + return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5303,6 +6006,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: return false; default: return true; @@ -5318,6 +6027,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: return false; default: return true; @@ -5342,10 +6057,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Check switch flag if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + // Avoid partial register update stalls unless optimizing for size. 
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. @@ -5460,62 +6173,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // Check switch flag - if (NoFusing) return 0; - - if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { - switch (MI->getOpcode()) { - default: return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - return true; - case X86::ADD32ri: - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. - if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return false; - break; - } - } - - if (Ops.size() != 1) - return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. - const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { - if (Opc == X86::MOV32r0) - return true; - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } - - if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) - return true; - return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); -} - bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr*> &NewMIs) const { @@ -5536,9 +6193,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. 
@@ -5582,20 +6240,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, if (FoldedStore) MIB.addReg(Reg, RegState::Define); - for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) - MIB.addOperand(BeforeOps[i]); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); - for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) - MIB.addOperand(AfterOps[i]); - for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { - MachineOperand &MO = ImpOps[i]; - MIB.addReg(MO.getReg(), - getDefRegState(MO.isDef()) | + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | RegState::Implicit | - getKillRegState(MO.isKill()) | - getDeadRegState(MO.isDead()) | - getUndefRegState(MO.isUndef())); + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { @@ -5686,9 +6343,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5729,9 +6388,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -6192,16 +6853,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. 
static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } @@ -6347,230 +7008,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, - const MachineBasicBlock *MBB) { - assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); - const MachineOperand &Op1 = Inst.getOperand(1); - const MachineOperand &Op2 = Inst.getOperand(2); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - // We need virtual register definitions. - MachineInstr *MI1 = nullptr; - MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) - MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) - MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - - // And they need to be in the trace (otherwise, they won't have a depth). - if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) - return true; +bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const { + assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && + "Reassociation needs binary operators"); + + // Integer binary math/logic instructions have a third source operand: + // the EFLAGS register. That operand must be both defined here and never + // used; ie, it must be dead. If the EFLAGS operand is live, then we can + // not change anything because rearranging the operands could affect other + // instructions that depend on the exact status flags (zero, sign, etc.) + // that are set by using these particular operands with this operation. + if (Inst.getNumOperands() == 4) { + assert(Inst.getOperand(3).isReg() && + Inst.getOperand(3).getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + if (!Inst.getOperand(3).isDead()) + return false; + } - return false; -} - -static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { - const MachineBasicBlock *MBB = Inst.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); - MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); - unsigned AssocOpcode = Inst.getOpcode(); - - // If only one operand has the same opcode and it's the second source operand, - // the operands must be commuted. - Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; - if (Commuted) - std::swap(MI1, MI2); - - // 1. The previous instruction must be the same type as Inst. - // 2. The previous instruction must have virtual register definitions for its - // operands in the same basic block as Inst. - // 3. The previous instruction's result must only be used by Inst. 
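The dead-EFLAGS requirement in hasReassociableOperands above has a concrete justification: reassociation preserves the final value but changes the intermediate one, so any flags defined by the first instruction of the chain change with it. A small illustration using the carry flag of an 8-bit ADD; the helper and the sample values are illustrative only, chosen so that the carries differ:

    #include <cassert>
    #include <cstdint>

    // Carry flag of an 8-bit ADD, as the hardware would define it.
    static bool carryOut8(uint8_t a, uint8_t b) {
      return unsigned(a) + unsigned(b) > 0xFFu;
    }

    int main() {
      uint8_t A = 200, X = 100, Y = 1;
      // Before: B = A + X; C = B + Y.     After: B' = X + Y; C = A + B'.
      // The final result C is unchanged...
      assert(uint8_t(uint8_t(A + X) + Y) == uint8_t(A + uint8_t(X + Y)));
      // ...but the flags produced by the *first* add are not, so a live
      // (non-dead) EFLAGS def on that instruction makes the transform illegal.
      assert(carryOut8(A, X) != carryOut8(X, Y));
      return 0;
    }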
- if (MI1->getOpcode() == AssocOpcode && - hasVirtualRegDefsInBasicBlock(*MI1, MBB) && - MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) - return true; - - return false; + return TargetInstrInfo::hasReassociableOperands(Inst, MBB); } // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) -// 2. Other math / logic operations (and, or) -static bool isAssociativeAndCommutative(unsigned Opcode) { - switch (Opcode) { +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::AND8rr: + case X86::AND16rr: + case X86::AND32rr: + case X86::AND64rr: + case X86::OR8rr: + case X86::OR16rr: + case X86::OR32rr: + case X86::OR64rr: + case X86::XOR8rr: + case X86::XOR16rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::IMUL16rr: + case X86::IMUL32rr: + case X86::IMUL64rr: + case X86::PANDrr: + case X86::PORrr: + case X86::PXORrr: + case X86::VPANDrr: + case X86::VPANDYrr: + case X86::VPORrr: + case X86::VPORYrr: + case X86::VPXORrr: + case X86::VPXORYrr: + // Normal min/max instructions are not commutative because of NaN and signed + // zero semantics, but these are. Thus, there's no need to check for global + // relaxed math; the instructions themselves have the properties we need. + case X86::MAXCPDrr: + case X86::MAXCPSrr: + case X86::MAXCSDrr: + case X86::MAXCSSrr: + case X86::MINCPDrr: + case X86::MINCPSrr: + case X86::MINCSDrr: + case X86::MINCSSrr: + case X86::VMAXCPDrr: + case X86::VMAXCPSrr: + case X86::VMAXCPDYrr: + case X86::VMAXCPSYrr: + case X86::VMAXCSDrr: + case X86::VMAXCSSrr: + case X86::VMINCPDrr: + case X86::VMINCPSrr: + case X86::VMINCPDYrr: + case X86::VMINCPSYrr: + case X86::VMINCSDrr: + case X86::VMINCSSrr: + return true; + case X86::ADDPDrr: + case X86::ADDPSrr: case X86::ADDSDrr: case X86::ADDSSrr: - case X86::VADDSDrr: - case X86::VADDSSrr: + case X86::MULPDrr: + case X86::MULPSrr: case X86::MULSDrr: case X86::MULSSrr: + case X86::VADDPDrr: + case X86::VADDPSrr: + case X86::VADDPDYrr: + case X86::VADDPSYrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::VMULPDrr: + case X86::VMULPSrr: + case X86::VMULPDYrr: + case X86::VMULPSYrr: case X86::VMULSDrr: case X86::VMULSSrr: - return true; + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; default: return false; } } -/// Return true if the input instruction is part of a chain of dependent ops -/// that are suitable for reassociation, otherwise return false. -/// If the instruction's operands must be commuted to have a previous -/// instruction of the same type define the first source operand, Commuted will -/// be set to true. -static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { - // 1. The operation must be associative and commutative. - // 2. The instruction must have virtual register definitions for its - // operands in the same basic block. - // 3. The instruction must have a reassociable sibling. - if (isAssociativeAndCommutative(Inst.getOpcode()) && - hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && - hasReassocSibling(Inst, Commuted)) - return true; - - return false; -} - -// FIXME: This has the potential to be expensive (compile time) while not -// improving the code at all. Some ways to limit the overhead: -// 1. Track successful transforms; bail out if hit rate gets too low. -// 2. 
Only enable at -O3 or some other non-default optimization level. -// 3. Pre-screen pattern candidates here: if an operand of the previous -// instruction is known to not increase the critical path, then don't match -// that pattern. -bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { - if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) - return false; - - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - - // Look for this reassociation pattern: - // B = A op X (Prev) - // C = B op Y (Root) - - bool Commute; - if (isReassocCandidate(Root, Commute)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. Specify each commutation - // possibility for the Prev instruction in the sequence and let the - // machine combiner decide if changing the operands is worthwhile. - if (Commute) { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); - } else { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); - } - return true; - } +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. + if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; - return false; + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. 
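Reassociation of the floating-point add/mul cases above is additionally gated on UnsafeFPMath because, unlike the integer and min/max cases, floating-point addition is not associative: reordering A + X + Y can change the rounded result. A two-line demonstration, with constants chosen so the difference is exact under the default IEEE round-to-nearest mode:

    #include <cassert>

    int main() {
      // (A + X) + Y versus A + (X + Y) with A = 1e16, X = -1e16, Y = 1.0:
      assert(((1e16 + -1e16) + 1.0) == 1.0);  // left-to-right order keeps the 1.0
      assert((1e16 + (-1e16 + 1.0)) == 0.0);  // reassociated order rounds it away
      return 0;
    }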
+ NewOp1.setIsDead(); + NewOp2.setIsDead(); } -/// Attempt the following reassociation to reduce critical path length: -/// B = A op X (Prev) -/// C = B op Y (Root) -/// ===> -/// B = X op Y -/// C = A op B -static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { - MachineFunction *MF = Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); - - // This array encodes the operand index for each parameter because the - // operands may be commuted. Each row corresponds to a pattern value, - // and each column specifies the index of A, B, X, Y. - unsigned OpIdx[4][4] = { - { 1, 1, 2, 2 }, - { 1, 2, 2, 1 }, - { 2, 1, 1, 2 }, - { 2, 2, 1, 1 } - }; - - MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); - MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); - MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); - MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); - MachineOperand &OpC = Root.getOperand(0); - - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(RegA)) - MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) - MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) - MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) - MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) - MRI.constrainRegClass(RegC, RC); - - // Create a new virtual register for the result of (X op Y) instead of - // recycling RegB because the MachineCombiner's computation of the critical - // path requires a new register definition rather than an existing one. - unsigned NewVR = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - - unsigned Opcode = Root.getOpcode(); - bool KillA = OpA.isKill(); - bool KillX = OpX.isKill(); - bool KillY = OpY.isKill(); - - // Create new instructions for insertion. - MachineInstrBuilder MIB1 = - BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) - .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); - InsInstrs.push_back(MIB1); - - MachineInstrBuilder MIB2 = - BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) - .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); - InsInstrs.push_back(MIB2); - - // Record old instructions for deletion. - DelInstrs.push_back(&Prev); - DelInstrs.push_back(&Root); +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); } -void X86InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { - MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); - - // Select the previous instruction in the sequence based on the input pattern. 
- MachineInstr *Prev = nullptr; - switch (Pattern) { - case MachineCombinerPattern::MC_REASSOC_AX_BY: - case MachineCombinerPattern::MC_REASSOC_XA_BY: - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - break; - case MachineCombinerPattern::MC_REASSOC_AX_YB: - case MachineCombinerPattern::MC_REASSOC_XA_YB: - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - } - assert(Prev && "Unknown pattern for machine combiner"); - - reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); - return; +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); } namespace { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index bf63336c7005..9d40334206b2 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -23,22 +23,10 @@ #include "X86GenInstrInfo.inc" namespace llvm { + class MachineInstrBuilder; class X86RegisterInfo; class X86Subtarget; - namespace MachineCombinerPattern { - enum MC_PATTERN : int { - // These are commutative variants for reassociating a computation chain - // of the form: - // B = A op X (Prev) - // C = B op Y (Root) - MC_REASSOC_AX_BY = 0, - MC_REASSOC_AX_YB = 1, - MC_REASSOC_XA_BY = 2, - MC_REASSOC_XA_YB = 3, - }; - } // end namespace MachineCombinerPattern - namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. @@ -259,14 +247,64 @@ public: MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const override; - /// commuteInstruction - We have a few instructions that must be hacked on to - /// commute them. + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value + /// 'CommuteAnyOperandIndex' assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. /// - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. 
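The query protocol documented here can be exercised as in the sketch below. It assumes a valid TargetInstrInfo and a commutable MI, and mirrors the CommuteAnyOperandIndex / commuteInstruction calls visible in foldMemoryOperandImpl above; it is a usage illustration, not code from the patch:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetInstrInfo.h"
    using namespace llvm;

    // Returns the commuted instruction, or nullptr if operand #1 cannot be
    // commuted with any other operand.
    static MachineInstr *commuteWithOperandOne(const TargetInstrInfo &TII,
                                               MachineInstr *MI) {
      unsigned Op1 = 1, Op2 = TargetInstrInfo::CommuteAnyOperandIndex;
      if (!TII.findCommutedOpIndices(MI, Op1, Op2))
        return nullptr;
      // Op2 now names a concrete operand; commute in place (NewMI == false).
      return TII.commuteInstruction(MI, /*NewMI=*/false, Op1, Op2);
    }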
bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. + /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value 'CommuteAnyOperandIndex' which means that the corresponding + /// operand index is not set and this method is free to pick any of + /// available commutable operands. + /// + /// For example, calling this method this way: + /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given MI but which has the operands + /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// + /// The returned FMA opcode may differ from the opcode in the given \p MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr* MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -342,11 +380,6 @@ public: MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const override; - /// canFoldMemoryOperand - Returns true if the specified load / store is - /// folding is possible. - bool canFoldMemoryOperand(const MachineInstr *, - ArrayRef<unsigned>) const override; - /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. If this is /// possible, returns true as well as the new instructions by reference. @@ -406,10 +439,9 @@ public: bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - static bool isX86_64ExtendedReg(const MachineOperand &MO) { - if (!MO.isReg()) return false; - return X86II::isX86_64ExtendedReg(MO.getReg()); - } + /// True if MI has a condition code def, e.g. EFLAGS, that is + /// not marked dead. + bool hasLiveCondCodeDef(MachineInstr *MI) const; /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. 
Output instructions required to @@ -452,26 +484,19 @@ public: const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const override; - bool useMachineCombiner() const override { return true; } - - /// Return true when there is potentially a faster code sequence - /// for an instruction chain ending in <Root>. All potential patterns are - /// output in the <Pattern> array. - bool getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override; - - /// When getMachineCombinerPatterns() finds a pattern, this function generates - /// the instructions that could replace the original code sequence. - void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + + bool hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const override; + + void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2 if having two register operands, and the value it @@ -500,16 +525,49 @@ public: unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const override; + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + +protected: + /// Commutes the operands in the given instruction by changing the operands + /// order and/or changing the instruction's opcode and/or the immediate value + /// operand. + /// + /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands + /// to be commuted. + /// + /// Do not call this method for a non-commutable instruction or + /// non-commutable operands. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; + + /// Expand the MOVImmSExti8 pseudo-instructions. 
+ bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 52bab9c79b45..f4ca2b880bad 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -106,8 +106,6 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; -def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; - def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -158,6 +156,8 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", @@ -250,9 +250,6 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, - [SDNPHasChain, SDNPOutGlue]>; - //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -344,18 +341,21 @@ def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; -// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of -// plain GR64, so that it doesn't potentially require a REX prefix. -def i8mem_NOREX : Operand<i64> { +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { let PrintMethod = "printi8mem"; - let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. // It represents GR32_TC, GR64_TC or GR64_TCW64. -def ptr_rc_tailcall : PointerLikeRegClass<2>; +def ptr_rc_tailcall : PointerLikeRegClass<4>; // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled @@ -697,34 +697,34 @@ def lea64mem : Operand<i64> { // X86 Complex Pattern Definitions. // -// Define X86 specific addressing mode. -def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; -def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. @@ -767,12 +767,21 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">, def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">, AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; @@ -794,6 +803,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -812,6 +822,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" + "Subtarget->getFrameLowering()->hasFP(*MF)">; def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -825,6 +837,7 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; def IsNotPIC : 
Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; +def OptForMinSize : Predicate<"OptForMinSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; @@ -867,20 +880,54 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ }]>; -def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; -def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; -def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; -def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field. -def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; def i64immZExt32SExt8 : ImmLeaf<i64, [{ - return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); }]>; // Helper fragments for loads. 
@@ -914,11 +961,12 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ return false; }]>; -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; @@ -1020,12 +1068,8 @@ def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], - IIC_PUSH_MEM>, OpSize16; def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], - IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; @@ -1039,6 +1083,14 @@ def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + } let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, @@ -1071,9 +1123,11 @@ def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; -} // mayStore, SchedRW +} // mayLoad, mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, @@ -1275,13 +1329,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), 
"mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; @@ -1457,10 +1511,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1894,37 +1950,38 @@ def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } // Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, Sched<[WriteLoad]>; let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, Requires<[Not64BitMode]>; // ASCII Adjust AX Before Division -// sets AL, AH and EFLAGS and uses AL and AH +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; // ASCII Adjust AX After Multiply -// sets AL, AH and EFLAGS and uses AL +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>; // ASCII Adjust AL After Subtraction - sets -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Addition -// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Subtraction -// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, Requires<[Not64BitMode]>; } // SchedRW @@ -2357,6 +2414,32 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; } // HasTBM, EFLAGS //===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], + IIC_SSE_MONITOR>, TB; +let Uses = [ECX, EAX, EBX] in +def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>, + TB; +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, 
+ Requires<[Not64BitMode]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let Uses = [EAX] in +def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB; + +//===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. //===----------------------------------------------------------------------===// @@ -2498,8 +2581,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; @@ -2532,14 +2615,15 @@ def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; -def : MnemonicAlias<"repe", "rep", "att">; -def : MnemonicAlias<"repz", "rep", "att">; -def : MnemonicAlias<"repnz", "repne", "att">; +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; def : MnemonicAlias<"sall", "shll", "att">; @@ -2579,14 +2663,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; @@ -2594,7 +2678,9 @@ def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; def : MnemonicAlias<"xsaveq", "xsave64", "att">; def : MnemonicAlias<"xrstorq", "xrstor64", "att">; def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; - +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> @@ -2640,8 +2726,8 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// // aad/aam default to base 10 if no operand is specified. 
-def : InstAlias<"aad", (AAD8i8 10)>; -def : InstAlias<"aam", (AAM8i8 10)>; +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. @@ -2719,8 +2805,10 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; @@ -2798,20 +2886,20 @@ def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16Bit // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; -def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; -def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; -def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp @@ -2861,9 +2949,9 @@ def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16: def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; -def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; -def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; -def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). 
Force to sldtw to avoid ambiguity @@ -2940,3 +3028,34 @@ def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. +def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index eaa7894004cb..11dc1e7d466b 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -249,6 +249,7 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; +let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (bitconvert GR64:$src))], @@ -262,7 +263,7 @@ def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. 
-let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], isBitcast = 1 in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", @@ -303,7 +304,7 @@ def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), + (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))))))], IIC_MMX_MOVQ_RR>; @@ -326,6 +327,7 @@ def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), } } // SchedRW +let Predicates = [HasSSE1] in def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], @@ -355,6 +357,7 @@ defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, MMX_INTALU_ITINS, 1>; defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, MMX_INTALUQ_ITINS, 1>; defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, @@ -382,6 +385,7 @@ defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, MMX_INTALUQ_ITINS>; @@ -408,8 +412,10 @@ defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, MMX_PMUL_ITINS, 1>; let isCommutable = 1 in @@ -422,6 +428,7 @@ defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, MMX_MISC_FUNC_ITINS, 1>; defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, @@ -439,6 +446,7 @@ defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, MMX_PSADBW_ITINS, 1>; +} defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, MMX_MISC_FUNC_ITINS>; @@ -594,6 +602,7 @@ let Constraints = "$src1 = $dst" in { } // Extract / Insert +let Predicates = [HasSSE1] in def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -601,6 +610,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { +let Predicates = [HasSSE1] in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), @@ -618,8 +628,10 @@ let Constraints = "$src1 = $dst" in { imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } +} // Mask creation +let Predicates = [HasSSE1] in def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", @@ -639,12 +651,12 @@ def : 
Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. let SchedRW = [WriteShuffle] in { -let Uses = [EDI] in +let Uses = [EDI], Predicates = [HasSSE1,In32BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], IIC_MMX_MASKMOV>; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], @@ -653,10 +665,6 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), // 64-bit bit convert. let Predicates = [HasSSE2] in { -def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), - (MMX_MOVD64to64rr GR64:$src)>; -def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), - (MMX_MOVD64from64rr VR64:$src)>; def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 99386b0658ad..7a44212bd829 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -330,9 +330,9 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, //===----------------------------------------------------------------------===// // A vector extract of the first f32/f64 position is a subregister copy -def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; -def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; // A 128-bit subvector extract from the first 256-bit vector position @@ -413,6 +413,8 @@ let Predicates = [HasSSE2] in { def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; } // Bitcasts between 256-bit vector types. Return the original type since @@ -650,10 +652,10 @@ let Predicates = [UseAVX] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; @@ -736,7 +738,7 @@ let Predicates = [UseSSE1] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; @@ -770,7 +772,7 @@ let Predicates = [UseSSE2] in { } // Extract and store. 
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; @@ -935,22 +937,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } -let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -} - - def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), @@ -1172,12 +1158,13 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, string base_opc, InstrItinClass itin> { - defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", itin>, VEX_4V; -let Constraints = "$src1 = $dst" in - defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $dst|$dst, $src2}", itin>; } @@ -1188,29 +1175,31 @@ let AddedComplexity = 20 in { } let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +}// UseAVX def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; @@ -1243,7 +1232,7 @@ let Predicates = [HasAVX] in { let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), 
addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; @@ -1297,31 +1286,33 @@ let AddedComplexity = 20 in { let SchedRW = [WriteStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // VMOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), @@ -1345,7 +1336,7 @@ let Predicates = [HasAVX] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; @@ -1377,7 +1368,7 @@ let Predicates = [UseSSE2] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; @@ -2073,14 +2064,16 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), (VCVTDQ2PSrm addr:$src)>; +} - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), @@ -2149,7 +2142,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), @@ -2306,7 
+2299,9 @@ let Predicates = [HasAVX] in { (VCVTDQ2PSYrr VR256:$src)>; def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; +} +let Predicates = [HasAVX, NoVLX] in { // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2452,9 +2447,9 @@ let Defs = [EFLAGS] in { defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, "ucomisd">, PD, VEX, VEX_LIG; let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS, VEX, VEX_LIG; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD, VEX, VEX_LIG; } @@ -2475,9 +2470,9 @@ let Defs = [EFLAGS] in { "ucomisd">, PD; let Pattern = []<dag> in { - defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS; - defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD; } @@ -2605,19 +2600,20 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFShuffle]>; } -defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SSEPackedSingle>, PS, VEX_4V; -defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; -defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SSEPackedDouble>, PD, VEX_4V; -defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; - +} let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -2627,7 +2623,7 @@ let Constraints = "$src1 = $dst" in { memopv2f64, SSEPackedDouble>, PD; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2694,6 +2690,7 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, Sched<[WriteFShuffleLd, ReadAfterLd]>; } +let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, VEX_4V; @@ -2719,7 +2716,7 @@ defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, VEX_4V, VEX_L; - +}// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, 
f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", @@ -2845,8 +2842,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, - OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX, NoVLX] in + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2854,7 +2851,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2, NoVLX] in +let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2863,13 +2860,13 @@ let Predicates = [HasAVX2, NoVLX] in // These are ordered here for pattern ordering requirements with the fp versions defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 0>; + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2911,7 +2908,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. 
multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; @@ -2923,7 +2920,7 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, PS, VEX_4V, VEX_L; - + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, PD, VEX_4V, VEX_L; @@ -3183,7 +3180,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE1] in { // extracted scalar math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3198,7 +3195,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3215,7 +3212,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3241,7 +3238,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE2] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3256,7 +3253,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3271,14 +3268,14 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; // extracted scalar math op with insert via blend 
def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3449,8 +3446,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, /// sse1_fp_unop_p - SSE1 unops in packed form. multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX] in { + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), @@ -3546,16 +3543,16 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -4018,39 +4015,43 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } // ExeDomain = SSEPackedInt defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 1>; + SSE_INTALUQ_ITINS_P, 1, NoVLX>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 0>; + SSE_INTALUQ_ITINS_P, 0, NoVLX>; defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PMINUB : 
PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4067,26 +4068,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; - -let Predicates = [HasAVX2] in - def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPSADBWYrr VR256:$src2, VR256:$src1)>; let Predicates = [HasAVX] in - def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPSADBWrr VR128:$src2, VR128:$src1)>; - -def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PSADBWrr VR128:$src2, VR128:$src1)>; +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, @@ -4105,9 +4098,6 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, //===---------------------------------------------------------------------===// let Predicates = [HasAVX, NoVLX] in { -defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4115,9 +4105,6 @@ defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4125,14 +4112,26 @@ defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR128, v2i64, v2i64, 
bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4147,13 +4146,9 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] let Predicates = [HasAVX2, NoVLX] in { -defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4161,9 +4156,6 @@ defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4171,14 +4163,25 @@ defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// 
Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -4193,8 +4196,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX2] +} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, @@ -4247,17 +4249,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { //===---------------------------------------------------------------------===// defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions @@ -4511,40 +4513,43 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, loadv2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, loadv2i64, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V, VEX_L; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, - bc_v8i32>, VEX_4V, VEX_L; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, - 
bc_v4i64>, VEX_4V, VEX_L; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, @@ -4600,7 +4605,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4615,7 +4620,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, Sched<[WriteShuffleLd, ReadAfterLd]>; // Insert -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in @@ -4683,7 +4688,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), } // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// -// SSE2 - Move Doubleword +// SSE2 - Move Doubleword/Quadword //===---------------------------------------------------------------------===// //===---------------------------------------------------------------------===// @@ -4770,23 +4775,23 @@ let isCodeGenOnly = 1 in { // def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4808,24 +4813,25 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), let SchedRW = [WriteMove] in { def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))], + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; } //SchedRW let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def VMOVPQIto64rm : VRS2I<0x7E, 
MRMDestMem, (outs i64mem:$dst), - (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4883,30 +4889,18 @@ let isCodeGenOnly = 1 in { IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } -//===---------------------------------------------------------------------===// -// Patterns and instructions to describe movd/movq to XMM register zero-extends -// -let isCodeGenOnly = 1, SchedRW = [WriteMove] in { -let AddedComplexity = 15 in { -def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>, - VEX, VEX_W; -def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>; -} -} // isCodeGenOnly, SchedRW - let Predicates = [UseAVX] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. 
let AddedComplexity = 20 in { @@ -4924,16 +4918,16 @@ let Predicates = [UseAVX] in { def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (MOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; @@ -4985,12 +4979,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; } // ExeDomain, SchedRW @@ -5119,7 +5113,7 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", @@ -5134,7 +5128,7 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), @@ -5190,21 +5184,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (v4f64 (X86Movddup - (scalar_to_vector (loadf64 addr:$src)))))]>, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), @@ -5212,16 +5215,6 @@ let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (v2i64 
(scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - - // 256-bit version - def : Pat<(X86Movddup (loadv4f64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (loadv4i64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; } let Predicates = [UseAVX, OptForSize] in { @@ -5791,37 +5784,37 @@ let Predicates = [HasAVX2] in let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palignr<"palignr">; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -6145,7 +6138,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -6170,7 +6163,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in 
defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -6194,7 +6187,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -6217,7 +6210,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, REX_W; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -6285,7 +6278,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -6311,7 +6304,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -6337,7 +6330,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -6543,71 +6536,71 @@ let Predicates = [HasAVX] in { let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; } let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x1))>; + (VROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x2))>; + (VROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc 
VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x3))>; + (VROUNDPSr VR128:$src, (i32 0xB))>; def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x1))>; + (VROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x2))>; + (VROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x3))>; + (VROUNDPDr VR128:$src, (i32 0xB))>; def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x1))>; + (VROUNDYPSr VR256:$src, (i32 0x9))>; def : Pat<(v8f32 (fnearbyint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0xC))>; def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x2))>; + (VROUNDYPSr VR256:$src, (i32 0xA))>; def : Pat<(v8f32 (frint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0x4))>; def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x3))>; + (VROUNDYPSr VR256:$src, (i32 0xB))>; def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x1))>; + (VROUNDYPDr VR256:$src, (i32 0x9))>; def : Pat<(v4f64 (fnearbyint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0xC))>; def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x2))>; + (VROUNDYPDr VR256:$src, (i32 0xA))>; def : Pat<(v4f64 (frint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0x4))>; def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x3))>; + (VROUNDYPDr VR256:$src, (i32 0xB))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6619,47 +6612,47 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x1))>; + (ROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x2))>; + (ROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x3))>; + (ROUNDPSr VR128:$src, (i32 0xB))>; def : 
Pat<(v2f64 (ffloor VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x1))>; + (ROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x2))>; + (ROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x3))>; + (ROUNDPDr VR128:$src, (i32 0xB))>; } //===----------------------------------------------------------------------===// @@ -7815,13 +7808,7 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // VBROADCAST - Load from memory and broadcast to all elements of the // destination operand // -class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : - AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; - -class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, PatFrag ld_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), @@ -7832,38 +7819,33 @@ class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, } // AVX2 adds register forms -class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int, SchedWrite Sched> : +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32, loadf32, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32, loadf32, WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64, loadf64, WriteFShuffleLd>, VEX_L; -def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256, - WriteFShuffleLd>, VEX_L; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps, - WriteFShuffle>; - def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256, - WriteFShuffle256>, VEX_L; + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256, - WriteFShuffle256>, VEX_L; +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; 
let mayLoad = 1, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), @@ -7871,6 +7853,13 @@ def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -7891,7 +7880,7 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, @@ -8080,17 +8069,19 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, (bitconvert (i_frag addr:$src2))))]>, VEX_4V, Sched<[WriteFShuffleLd, ReadAfterLd]>; - def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; - def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] } let ExeDomain = SSEPackedSingle in { @@ -8106,7 +8097,7 @@ let ExeDomain = SSEPackedDouble in { loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), (VPERMILPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), @@ -8245,11 +8236,11 @@ let Predicates = [HasF16C] in { def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16 + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16 + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; @@ -8309,97 +8300,62 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, // multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { - def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int128 VR128:$src))]>, + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; - 
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; - def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; - def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; + v16i8, v32i8, NoVLX_Or_NoBWI>; defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; + v8i16, v16i16, NoVLX_Or_NoBWI>; defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; + v4i32, v8i32, NoVLX>; defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; + v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; - def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSrr VR128:$src)>; - def : Pat<(v8f32 
(X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSYrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), - (VBROADCASTSDYrr VR128:$src)>; + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; // Provide aliases for broadcast from the same register class that // automatically does the extract. - def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), - (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), - sub_xmm)))>; - def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), - (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), - sub_xmm)))>; - def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), - (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), - (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), - sub_xmm)))>; def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))>; @@ -8598,7 +8554,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, @@ -8722,16 +8678,16 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), @@ -8776,10 +8732,10 @@ def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0) (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), VR128:$mask)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), (VPMASKMOVQYmr addr:$ptr, VR256:$mask, 
VR256:$src)>; def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), @@ -8804,10 +8760,10 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0) (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), VR256:$mask)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), @@ -8865,12 +8821,13 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; } -defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; -defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; -defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; -defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; -defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, @@ -8905,3 +8862,59 @@ let mayLoad = 1, Constraints defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; } } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index caecf7001ef5..c1df9780a0e0 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -31,21 +31,21 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] -def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
-def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -85,19 +85,19 @@ def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), "shl{w}\t{$src, $dst|$dst, $src}", [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), "shl{l}\t{$src, $dst|$dst, $src}", [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -137,18 +137,18 @@ def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } -def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), "shr{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shr{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shr{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -185,19 +185,19 @@ def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), 
addr:$dst)], IIC_SR>; -def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), "shr{w}\t{$src, $dst|$dst, $src}", [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), "shr{l}\t{$src, $dst|$dst, $src}", [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -241,20 +241,20 @@ def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), IIC_SR>; } -def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "sar{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "sar{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "sar{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -298,19 +298,19 @@ def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), "sar{w}\t{$src, $dst|$dst, $src}", [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), "sar{l}\t{$src, $dst|$dst, $src}", [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -342,7 +342,7 @@ let hasSideEffects = 0 in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), @@ 
-350,7 +350,7 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), @@ -358,7 +358,7 @@ def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), @@ -367,7 +367,7 @@ def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -376,7 +376,7 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -384,7 +384,7 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), @@ -392,7 +392,7 @@ def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), @@ -400,7 +400,7 @@ def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), @@ -411,36 +411,36 @@ def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins 
i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in { @@ -482,19 +482,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } -def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -537,19 +537,19 @@ def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], 
IIC_SR>; } -def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; -def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), "rol{w}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize16; -def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), "rol{l}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize32; -def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; @@ -589,19 +589,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } -def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -644,19 +644,19 @@ def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), "ror{w}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), "ror{l}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -727,42 +727,42 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), let isCommutable = 1 in { // These instructions commute to each other. 
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, @@ -801,14 +801,14 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), } def SHLD16mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], @@ -816,14 +816,14 @@ def SHRD16mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize16; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], @@ -831,14 +831,14 @@ def SHRD32mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize32; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; def SHRD64mri8 : RIi8<0xAC, 
MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], @@ -860,12 +860,12 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let hasSideEffects = 0 in { - def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShiftLd]>; } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 0350566f8b9b..85e17f516f91 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -44,7 +44,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", let SchedRW = [WriteSystem] in { -def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)], IIC_INT>; @@ -60,12 +60,6 @@ def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; - -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>, - OpSize32; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, - Requires<[In64BitMode]>; } // SchedRW def : Pat<(debugtrap), @@ -88,13 +82,13 @@ def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; let Defs = [AL] in -def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in -def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; let Defs = [EAX] in -def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; let Uses = [DX, AL] in @@ -108,13 +102,13 @@ def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; let Uses = [AL] in -def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in -def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; let Uses = [EAX] in -def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; } // SchedRW @@ -478,39 +472,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; //===----------------------------------------------------------------------===// // XSAVE 
instructions let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} -let Uses = [RDX, RAX] in { - def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave\t$dst", []>, TB; - def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor\t$dst", []>, TB; + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, PS; - def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; - + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors\t$dst", []>, TB; + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec\t$dst", []>, TB; - def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves\t$dst", []>, TB; - def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; } +} // Uses } // SchedRW //===----------------------------------------------------------------------===// @@ -534,6 +549,12 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; 
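For context, and not part of the patch itself: the XSAVE hunk above switches the save area to an input memory operand and attaches selection patterns for the llvm.x86.xsave* intrinsics, gated on the HasXSAVE/HasXSAVEOPT/HasXSAVEC/HasXSAVES predicates. Below is a minimal C++ sketch of user code that would be lowered through those new patterns, assuming a Clang or GCC toolchain that exposes _xsave via <immintrin.h> and a build with -mxsave; the buffer size and component mask are illustrative only.

#include <immintrin.h>
#include <cstdlib>
#include <cstring>

int main() {
  // XSAVE requires a 64-byte aligned save area; 1024 bytes comfortably
  // covers the 512-byte legacy region plus the 64-byte XSAVE header.
  void *area = std::aligned_alloc(64, 1024);
  std::memset(area, 0, 1024);  // zero the header before first use
  _xsave(area, 0x3);           // request x87 (bit 0) and SSE (bit 1) state
  std::free(area);
  return 0;
}

Clang, for instance, lowers _xsave to the llvm.x86.xsave intrinsic with the mask split into EDX:EAX, which is exactly the form the XSAVE def above now matches instead of leaving the instruction without a pattern.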
+//==-----------------------------------------------------------------------===// +// PKU - enable protection key +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRU : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRU : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; //===----------------------------------------------------------------------===// // FS/GS Base Instructions diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 8455b8d8467c..4cb2304e464d 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -83,57 +83,64 @@ let ExeDomain = SSEPackedDouble in { defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; } -multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, - XOP_4V, VEX_W; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, - XOP_4VOp3; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; - defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; - defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; - defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; - defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; - defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; - defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; - defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; - defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; - defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; - defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass 
xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; } let ExeDomain = SSEPackedInt in { - defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; - defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; - defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -170,30 +177,34 @@ let ExeDomain = SSEPackedInt in { } // Instruction where second source can be memory, third must be imm8 -multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - i8immZExt3:$cc))]>, XOP_4V; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; let mayLoad = 1 in def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; @@ -201,14 +212,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { } let ExeDomain = SSEPackedInt in { // SSE integer instructions - defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; - defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; - defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; - defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; - defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; - defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; - defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; 
- defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; } // Instruction where either second or third source can be memory @@ -270,42 +281,52 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { let ExeDomain = SSEPackedInt in defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, VEX_W, MemOp4; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2c8b95bcba22..dc6d85d582c8 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -18,14 +18,19 @@ namespace 
llvm { enum IntrinsicType { INTR_NO_TYPE, - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, - VPERM_3OP_MASKZ, - INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, - EXPAND_FROM_MEM, BLEND + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -138,6 +143,42 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -209,6 +250,8 @@ static const IntrinsicData 
IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -241,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -274,16 +318,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + 
X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -371,6 +455,50 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + 
X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -388,6 +516,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -415,7 +547,184 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - + X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + 
X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, @@ -452,6 +761,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, @@ -464,6 +781,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + 
X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, @@ -472,10 +845,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -484,10 +857,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + 
X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, @@ -554,6 +949,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), @@ -596,6 +997,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), @@ -644,6 +1057,114 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -700,6 +1221,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), @@ -728,16 +1255,98 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + 
X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, 
INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -750,6 +1359,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, @@ -758,6 +1399,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, @@ -782,9 +1427,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - + 
X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, @@ -821,7 +1511,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, @@ -852,54 +1541,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + 
X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, @@ -910,7 +1601,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + 
X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, @@ -959,14 +1661,59 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, 
X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1017,6 +1764,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), @@ -1024,6 +1773,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), @@ -1066,12 +1816,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), @@ -1105,7 +1849,31 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, 
X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) }; /* @@ -1128,6 +1896,102 @@ static void verifyIntrinsicTables() { std::is_sorted(std::begin(IntrinsicsWithChain), std::end(IntrinsicsWithChain)) && "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } } } // End llvm namespace diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 3415cedc6fea..e186f7039b43 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -92,7 +92,6 @@ namespace llvm { SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); CurrentShadowSize += Code.size(); if (CurrentShadowSize >= RequiredShadowSize) InShadow = false; // The shadow is big enough. Stop counting. @@ -128,7 +127,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); MCSymbol *Sym = nullptr; @@ -151,7 +150,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); @@ -159,7 +158,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { const GlobalValue *GV = MO.getGlobal(); AsmPrinter.getNameWithPrefix(Name, GV); } else if (MO.isSymbol()) { - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else if (MO.isMBB()) { assert(Suffix.empty()); Sym = MO.getMBB()->getSymbol(); @@ -461,6 +460,7 @@ ReSimplify: // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but other isn't. 
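// (Only VEX.R can be encoded in the 2-byte VEX prefix; VEX.B forces the 3-byte form, so putting the extended register in the ModRM.reg field gives the shorter encoding.)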
+ case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -478,18 +478,19 @@ ReSimplify: unsigned NewOpc; switch (OutMI.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } OutMI.setOpcode(NewOpc); } @@ -532,6 +533,23 @@ ReSimplify: break; } + case X86::CLEANUPRET: { + // Replace CATCHRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. 
case X86::TAILJMPr: case X86::TAILJMPd: @@ -598,17 +616,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; @@ -875,7 +905,10 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, MCInst LoadMI; LoadMI.setOpcode(LoadOpcode); - LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, E = MI.operands_end(); I != E; ++I) @@ -1062,6 +1095,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86ATTInstPrinter::getRegisterName(Reg)); break; } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + case X86::TAILJMPr: case X86::TAILJMPm: case X86::TAILJMPd: @@ -1095,12 +1140,30 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. 
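+ // The CALLpcrel32 above pushes the return address and the POP32r below removes it,
+ // so when there is no frame pointer the CFA offset has to be adjusted by one slot
+ // across the call/pop pair to keep the DWARF unwind information accurate.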
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + // Emit the label. OutStreamer->EmitLabel(PICBase); // popl $reg EmitAndCountInstruction(MCInstBuilder(X86::POP32r) .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } return; } @@ -1206,19 +1269,48 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Lower PSHUFB and VPERMILP normally but add a comment if we can find - // a constant shuffle mask. We won't be able to do this at the MC layer - // because the mask isn't an immediate. + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. case X86::PSHUFBrm: case X86::VPSHUFBrm: - case X86::VPSHUFBYrm: { + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; @@ -1240,35 +1332,53 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; - DecodeVPERMILPMask(C, Mask); + DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } - // For loads from a constant pool to a vector register, print the constant - // loaded. 
- case X86::MOVAPDrm: - case X86::VMOVAPDrm: - case X86::VMOVAPDYrm: - case X86::MOVUPDrm: - case X86::VMOVUPDrm: - case X86::VMOVUPDYrm: - case X86::MOVAPSrm: - case X86::VMOVAPSrm: - case X86::VMOVAPSYrm: - case X86::MOVUPSrm: - case X86::VMOVUPSrm: - case X86::VMOVUPSYrm: - case X86::MOVDQArm: - case X86::VMOVDQArm: - case X86::VMOVDQAYrm: - case X86::MOVDQUrm: - case X86::VMOVDQUrm: - case X86::VMOVDQUYrm: +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. + CASE_ALL_MOV_RM() if (!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() > 4) @@ -1302,7 +1412,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa<UndefValue>(COp)) { CS << "u"; } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp index ac2cdc8c6567..c9e636f1eb00 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index e6db9708b677..3a7a98db50f4 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -84,8 +84,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// of pushes to pass function parameters. bool HasPushSequences = false; - /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill - /// slot for the frame pointer. + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. 
bool HasSEHFramePtrSave = false; /// The frame index of a stack object containing the original frame pointer @@ -100,7 +100,7 @@ private: public: X86MachineFunctionInfo() = default; - explicit X86MachineFunctionInfo(MachineFunction &MF) {}; + explicit X86MachineFunctionInfo(MachineFunction &MF) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 000000000000..58020d909a43 --- /dev/null +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,326 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions if they have already + /// been calculated by an LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns the distance between two instructions inside one basic block. + /// A negative result means the instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the chosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operands are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA.
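+ /// (i.e. its opcode is LEA16r, LEA32r, LEA64r or LEA64_32r).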
+ bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. 
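+ // Illustrative example (not from the original source), in MIR-like notation:
+ // if the block contains
+ //   %lea = LEA64r %rdi, 1, %noreg, 16, %noreg
+ // and MI is
+ //   %val = MOV64rm %rdi, 1, %noreg, 24, %noreg
+ // then the two addresses differ only by displacement, and MI can be rewritten as
+ //   %val = MOV64rm %lea, 1, %noreg, 8, %noreg
+ // with AddrDispShift = 24 - 16 = 8.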
+ int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. 
If the LEA occurs after, we can hoist the LEA above the + // instruction and then replace it. Since the LEA and the + // instruction have similar memory operands (and thus the same def + // instructions for those operands), we can always do that without + // using a register before its def. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + + // Find all LEA instructions in the basic block. + findLEAs(MBB, LEAs); + + // If the current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 143e70bda9e7..0f425e28fa7d 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -93,8 +93,7 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { + if (MF.getFunction()->optForSize()) { return false; } @@ -107,7 +106,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); - findReturns(MF.begin()); + findReturns(&MF.front()); bool MadeChange = false; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index d8495e53e0e3..58404433e1ae 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -44,12 +43,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" -cl::opt<bool> -ForceStackAlign("force-align-stack", - cl::desc("Force align the stack to the minimum alignment" - " needed for the function."), - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -174,21 +167,34 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; - case 2: // Available for tailcall (not callee-saved GPRs). - const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); } } const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? 
F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -222,6 +228,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->getMMI().callsEHReturn(); @@ -241,6 +248,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -254,6 +265,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_Intel_OCL_BI_SaveList; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -264,6 +277,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } default: break; } @@ -284,6 +309,7 @@ const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -301,6 +327,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; @@ -314,16 +344,30 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_Intel_OCL_BI_RegMask; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - default: - break; case CallingConv::X86_64_Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + else + return CSR_64_AllRegs_RegMask; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_RegMask; + else + return CSR_32_AllRegs_RegMask; + } + default: + break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -341,6 +385,10 @@ X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } +const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const { + return CSR_64_TLS_Darwin_RegMask; +} + BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); @@ -371,8 
+419,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, - false); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -439,6 +486,10 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -451,13 +502,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); - bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); - return CantUseFP && CantUseSP; + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -470,26 +519,11 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { // If a base pointer is necessary. Check that it isn't too late to reserve // it. - if (MFI->hasVarSizedObjects()) + if (CantUseSP(MFI)) return MRI->canReserveReg(BasePtr); return true; } -bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - // If we've requested that we force align the stack do so now. - if (ForceStackAlign) - return canRealignStack(MF); - - return requiresRealignment && canRealignStack(MF); -} - bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true @@ -510,6 +544,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) @@ -524,14 +559,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. 
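The hasBasePointer()/CantUseSP() change in the hunk above folds the two reasons the stack pointer cannot address locals into one helper, so the base-pointer decision reads as "FP unusable and SP unusable". A minimal standalone sketch of that decision, using a plain struct instead of the real MachineFrameInfo API (illustrative only):

// Standalone sketch of the base-pointer decision refactored above; the
// struct and field names are stand-ins, not LLVM types.
#include <cstdio>

struct FrameInfoSketch {
  bool HasVarSizedObjects;     // dynamically sized allocas
  bool HasOpaqueSPAdjustment;  // e.g. inline asm that moves SP behind our back
  bool NeedsStackRealignment;  // requested alignment exceeds the ABI stack alignment
};

static bool cantUseSP(const FrameInfoSketch &MFI) {
  return MFI.HasVarSizedObjects || MFI.HasOpaqueSPAdjustment;
}

static bool needsBasePointer(const FrameInfoSketch &MFI) {
  bool CantUseFP = MFI.NeedsStackRealignment;  // FP is busy realigning the stack
  return CantUseFP && cantUseSP(MFI);
}

int main() {
  FrameInfoSketch F{true, false, true};
  std::printf("base pointer needed: %d\n", needsBasePointer(F));
  return 0;
}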
+ unsigned IgnoredFrameReg; if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; - if (IsWinEH) - Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); - else - Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); FI.ChangeToImmediate(Offset); return; } @@ -540,7 +572,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) - BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + BasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. @@ -553,7 +585,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const MachineFrameInfo *MFI = MF.getFrameInfo(); FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else - FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); if (BasePtr == StackPtr) FIOffset += SPAdj; @@ -592,193 +624,11 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) - FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } -namespace llvm { -unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - switch (VT) { - default: return 0; - case MVT::i8: - if (High) { - switch (Reg) { - default: return getX86SubSuperRegister(Reg, MVT::i64); - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AH; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DH; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CH; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BH; - } - } else { - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AL; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DL; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CL; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BL; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SIL; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DIL; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BPL; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SPL; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8B; - case X86::R9B: case 
X86::R9W: case X86::R9D: case X86::R9: - return X86::R9B; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10B; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11B; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12B; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13B; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14B; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15B; - } - } - case MVT::i16: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8W; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9W; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10W; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11W; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12W; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13W; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14W; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15W; - } - case MVT::i32: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::EAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::EDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::ECX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::EBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::ESI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::EDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::EBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::ESP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8D; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9D; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10D; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11D; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12D; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13D; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14D; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15D; - } - case MVT::i64: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return 
X86::RAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::RDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::RCX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::RBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::RSI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::RDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::RBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::RSP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15; - } - } -} - -unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High); - if (Res == 0) - llvm_unreachable("Unexpected register or VT"); - return Res; -} - -unsigned get512BitSuperRegister(unsigned Reg) { +unsigned llvm::get512BitSuperRegister(unsigned Reg) { if (Reg >= X86::XMM0 && Reg <= X86::XMM31) return X86::ZMM0 + (Reg - X86::XMM0); if (Reg >= X86::YMM0 && Reg <= X86::YMM31) @@ -787,5 +637,3 @@ unsigned get512BitSuperRegister(unsigned Reg) { return Reg; llvm_unreachable("Unexpected SIMD register"); } - -} diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 8de1d0bf8ec8..f014c8f6ff61 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -87,6 +87,11 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -96,7 +101,11 @@ public: getCalleeSavedRegs(const MachineFunction* MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
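The large switches deleted above are replaced by a getX86SubSuperRegister form keyed on a bit width, so callers in these hunks write getX86SubSuperRegister(Reg, 64) where they previously wrote getX86SubSuperRegister(Reg, MVT::i64). A standalone sketch of the mapping idea, covering only the RAX family and not the real LLVM helper:

// Illustrative only: pick the member of a GPR family that matches the
// requested width in bits. The real helper covers every x86 GPR family
// plus an optional high-8-bit flag.
#include <cstdio>

enum class Reg { AL, AX, EAX, RAX, Unknown };

static Reg subSuperRegister(Reg R, unsigned SizeInBits) {
  switch (R) {
  case Reg::AL: case Reg::AX: case Reg::EAX: case Reg::RAX:
    switch (SizeInBits) {
    case 8:  return Reg::AL;
    case 16: return Reg::AX;
    case 32: return Reg::EAX;
    case 64: return Reg::RAX;
    }
    break;
  default:
    break;
  }
  return Reg::Unknown;
}

int main() {
  // Equivalent of asking for the 64-bit super-register of EAX.
  std::printf("%d\n", subSuperRegister(Reg::EAX, 64) == Reg::RAX);
  return 0;
}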
+ const uint32_t *getDarwinTLSCallPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and @@ -108,9 +117,7 @@ public: bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const override; @@ -128,16 +135,6 @@ public: unsigned getSlotSize() const { return SlotSize; } }; -/// Returns the sub or super register of a specific X86 register. -/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX. -/// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); - -/// Returns the sub or super register of a specific X86 register. -/// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType, - bool High = false); - //get512BitRegister - X86 utility - returns 512-bit super register unsigned get512BitSuperRegister(unsigned Reg); diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index cdb151c26a05..56f0d9352d30 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -225,15 +225,15 @@ let SubRegIndices = [sub_ymm] in { } } - // Mask Registers, used by AVX-512 instructions. - def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; - def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; - def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; - def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; - def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; - def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; - def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; - def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, @@ -423,6 +423,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. 
This will cause us to spill @@ -442,10 +444,11 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { } // Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -459,8 +462,8 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, - (sequence "ZMM%u", 0, 31)>; +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -468,10 +471,10 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 31)>; +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; // Mask registers def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} @@ -491,4 +494,4 @@ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers -def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
\ No newline at end of file +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf9ad81..b1a01614b4a1 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index dff3624b7efe..8ef08c960f0b 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -44,9 +44,8 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); -/// ClassifyBlockAddressReference - Classify a blockaddress reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. unsigned char X86Subtarget::ClassifyBlockAddressReference() const { if (isPICStyleGOT()) // 32-bit ELF targets. return X86II::MO_GOTOFF; @@ -58,9 +57,8 @@ unsigned char X86Subtarget::ClassifyBlockAddressReference() const { return X86II::MO_NO_FLAG; } -/// ClassifyGlobalReference - Classify a global variable reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. unsigned char X86Subtarget:: ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // DLLImport only exists on windows, it is implemented as a load from a @@ -147,9 +145,9 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { } -/// getBZeroEntry - This function returns the name of a function which has an -/// interface like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over memset with zero +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. @@ -166,8 +164,7 @@ bool X86Subtarget::hasSinCos() const { is64Bit(); } -/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls -/// to immediate address. +/// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, @@ -192,9 +189,25 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. 
+ if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -224,13 +237,18 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { } void X86Subtarget::initializeEnvironment() { - X86SSELevel = NoMMXSSE; + X86SSELevel = NoSSE; X863DNowLevel = NoThreeDNow; HasCMov = false; HasX86_64 = false; HasPOPCNT = false; HasSSE4A = false; HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; HasPCLMUL = false; HasFMA = false; HasFMA4 = false; @@ -252,13 +270,15 @@ void X86Subtarget::initializeEnvironment() { HasBWI = false; HasVLX = false; HasADX = false; + HasPKU = false; HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; - IsUAMemFast = false; + IsUAMem16Slow = false; IsUAMem32Slow = false; HasSSEUnalignedMem = false; HasCmpxchg16b = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index f026d4295f71..13d1026dcaa0 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -47,11 +47,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F }; enum X863DNowEnum { - NoThreeDNow, ThreeDNow, ThreeDNowA + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; enum X86ProcFamilyEnum { @@ -64,10 +64,10 @@ protected: /// Which PIC style to use PICStyles::Style PICStyle; - /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// 3DNow, 3DNow Athlon, or none supported. + /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; /// True if this processor has conditional move instructions @@ -86,6 +86,18 @@ protected: /// Target has AES instructions bool HasAES; + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + /// Target has carry-less multiplication bool HasPCLMUL; @@ -140,16 +152,19 @@ protected: /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// True if unaligned memory access is fast. - bool IsUAMemFast; + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; - /// True if unaligned 32-byte memory accesses are slow. + /// True if unaligned memory accesses of 32-bytes are slow. bool IsUAMem32Slow; /// True if SSE operations can have unaligned memory operands. 
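The enum changes above (NoSSE replacing NoMMXSSE, and MMX moving into the 3DNow-style enum) depend on the enumerators staying ordered, so that each has*() accessor is a single >= comparison against the recorded level. A self-contained sketch of that pattern with hypothetical SubtargetSketch names, not X86Subtarget itself:

// Ordered feature ladders: "has at least SSE2" and "has at least MMX" are
// plain comparisons once the enumerators are declared in feature order.
#include <cstdio>

enum SSELevel { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F };
enum X3DNowLevel { NoThreeDNow, MMX, ThreeDNow, ThreeDNowA };

struct SubtargetSketch {
  SSELevel SSE = NoSSE;
  X3DNowLevel X3DNow = NoThreeDNow;

  bool hasSSE2() const { return SSE >= SSE2; }
  bool hasMMX() const { return X3DNow >= MMX; }  // MMX now lives on the 3DNow ladder
};

int main() {
  SubtargetSketch ST;
  ST.SSE = SSE41;
  ST.X3DNow = MMX;
  std::printf("SSE2: %d  MMX: %d\n", ST.hasSSE2(), ST.hasMMX());
  return 0;
}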
@@ -208,6 +223,9 @@ protected: /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX; + /// Processor has PKU extenstions + bool HasPKU; + /// Processot supports MPX - Memory Protection Extensions bool HasMPX; @@ -319,7 +337,6 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasCMov() const { return HasCMov; } - bool hasMMX() const { return X86SSELevel >= MMX; } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -332,14 +349,22 @@ public: bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. + bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } @@ -355,9 +380,10 @@ public: bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } - bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } @@ -375,6 +401,7 @@ public: bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -394,9 +421,11 @@ public: bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetWindowsMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); @@ -406,6 +435,10 @@ public: return TargetTriple.isKnownWindowsMSVCEnvironment(); } + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + bool isTargetWindowsCygwin() const { 
return TargetTriple.isWindowsCygwinEnvironment(); } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index fb9cb4ba4c86..0e7e4c0c84a9 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -28,10 +28,17 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -45,7 +52,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) return make_unique<X86ELFTargetObjectFile>(); - if (TT.isKnownWindowsMSVCEnvironment()) + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) return make_unique<TargetLoweringObjectFileCOFF>(); @@ -175,8 +182,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, //===----------------------------------------------------------------------===// TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); } @@ -246,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 6f900ea351ef..782768d0ab16 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -152,9 +153,8 @@ static std::string scalarConstantToHexString(const Constant *C) { } } -MCSection * -X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst() && C) { const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | @@ -171,5 +171,5 @@ X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, COFF::IMAGE_COMDAT_SELECT_ANY); } - return TargetLoweringObjectFile::getSectionForConstant(Kind, C); + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); } diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 66366b2373cd..6b2448cc9de6 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -58,7 +58,7 @@ namespace llvm { /// \brief Given a mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. 
- MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 7df726091843..2e7bbb208743 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86tti" @@ -62,8 +63,8 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (ST->is64Bit()) return 64; - return 32; + return 32; } unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { @@ -84,12 +85,12 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned X86TTIImpl::getArithmeticInstrCost( +int X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -101,10 +102,9 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence SRA + SRL + ADD + SRA. // The OperandValue properties many not be same as that of previous // operation;conservatively assume OP_None. - unsigned Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -115,8 +115,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return Cost; } - static const CostTblEntry<MVT::SimpleValueType> - AVX2UniformConstCostTable[] = { + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -127,12 +126,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = { + static const CostTblEntry AVX512CostTable[] = { { ISD::SHL, MVT::v16i32, 1 }, { ISD::SRL, MVT::v16i32, 1 }, { ISD::SRA, MVT::v16i32, 1 }, @@ -141,7 +140,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v8i64, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. 
{ ISD::SHL, MVT::v4i32, 1 }, @@ -154,7 +158,57 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 1 }, { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. @@ -163,7 +217,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -176,44 +231,44 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v4i64, 4*20 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX512CostTable[Idx].Cost; - } - // Look for AVX2 lowering tricks. + // Look for AVX2 lowering tricks for custom cases. if (ST->hasAVX2()) { - if (ISD == ISD::SHL && LT.second == MVT::v16i16 && - (Op2Info == TargetTransformInfo::OK_UniformConstantValue || - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) - // On AVX2, a packed v16i16 shift left by a constant build_vector - // is lowered into a vector multiply (vpmullw). - return LT.first; - - int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> + static const CostTblEntry SSE2UniformConstCostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. 
// Constant splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld { ISD::SHL, MVT::v2i64, 1 }, // psllq. + { ISD::SHL, MVT::v4i64, 2 }, // psllq. { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -227,27 +282,34 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } if (ISD == ISD::SHL && Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { - EVT VT = LT.second; + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). if ((VT == MVT::v8i16 && ST->hasSSE2()) || (VT == MVT::v4i32 && ST->hasSSE41())) - // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. + if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. if (VT == MVT::v4i32 && ST->hasSSE2()) - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. ISD = ISD::MUL; } - static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { + static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // For some cases, where the shift amount is a scalar we would be able @@ -257,20 +319,31 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // used for vectorization and we don't want to make vectorized code worse // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. 
+ { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. // It is not a good idea to vectorize division. We have to scalarize it and // in the process we will often end up having to spilling regular @@ -289,12 +362,11 @@ unsigned X86TTIImpl::getArithmeticInstrCost( }; if (ST->hasSSE2()) { - int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = { + static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -314,29 +386,21 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // Look for AVX1 lowering tricks. if (ST->hasAVX() && !ST->hasAVX2()) { - EVT VT = LT.second; + MVT VT = LT.second; - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) && - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) - ISD = ISD::MUL; - - int Idx = CostTableLookup(AVX1CostTable, ISD, VT); - if (Idx != -1) - return LT.first * AVX1CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; } // Custom lowering of vectors. - static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = { + static const CostTblEntry CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long // multiplies(3), shifts(4) and adds(2). { ISD::MUL, MVT::v2i64, 9 }, { ISD::MUL, MVT::v4i64, 9 }, }; - int Idx = CostTableLookup(CustomLowered, ISD, LT.second); - if (Idx != -1) - return LT.first * CustomLowered[Idx].Cost; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, // 2x pmuludq, 2x shuffle. 
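The cost-model hunks around this point all follow the same shape, which this change converts from index-based table lookups to the entry-pointer form: legalize the IR type, look up the (opcode, legalized MVT) pair in a per-ISA table, and scale the per-operation cost by how many pieces legalization split the type into. A self-contained sketch of that pattern; the table, keys, and numbers are stand-ins, not the LLVM CostTableLookup API:

// "Legalize, look up, scale": e.g. a v8i32 logical shift right on SSE2 is
// split into two v4i32 operations, each costed at 16 in the table above.
#include <cstdio>
#include <utility>

struct CostEntrySketch {
  int ISD;   // opcode key (stand-in for ISD::SRL)
  int VT;    // legalized type key (stand-in for MVT::v4i32)
  int Cost;  // cost of one legalized operation
};

static const CostEntrySketch SketchTable[] = {
    {/*SRL*/ 1, /*v4i32*/ 10, 16},  // shift each lane + blend
};

static const CostEntrySketch *lookup(int ISD, int VT) {
  for (const auto &E : SketchTable)
    if (E.ISD == ISD && E.VT == VT)
      return &E;    // pointer to the matching entry
  return nullptr;   // not in the table: fall back to the base cost model
}

int main() {
  // Pretend type legalization split v8i32 into two v4i32 halves.
  std::pair<int, int> LT = {/*split factor*/ 2, /*legal VT*/ 10};
  if (const auto *Entry = lookup(/*SRL*/ 1, LT.second))
    std::printf("cost = %d\n", LT.first * Entry->Cost);  // prints cost = 32
  return 0;
}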
@@ -348,15 +412,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - unsigned Cost = 1; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -367,14 +431,14 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. if (ST->hasAVX2() && LT.second == MVT::v16i16) return LT.first; - static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = { + static const CostTblEntry AVXAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd @@ -390,13 +454,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} }; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * AVXAltShuffleTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = { + static const CostTblEntry SSE41AltShuffleTbl[] = { // These are lowered into movsd. 
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, @@ -414,13 +477,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} }; - if (ST->hasSSE41()) { - int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSE41AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = { + static const CostTblEntry SSSE3AltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -433,13 +495,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - if (ST->hasSSSE3()) { - int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSSE3AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = { + static const CostTblEntry SSEAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -454,65 +515,47 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, }; // Fall-back (SSE3 and SSE2). - int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSEAltShuffleTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - SSE2ConvTbl[] = { - // These are somewhat magic numbers justified by looking at the output of - // Intel's IACA, running some kernels and making sure when we take - // legalization into account the throughput will be overestimated. - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - // There are faster sequences for float conversions. 
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, }; - if (ST->hasSSE2() && !ST->hasAVX()) { - int Idx = - ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second); - if (Idx != -1) - return LTSrc.first * SSE2ConvTbl[Idx].Cost; - } - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -522,33 +565,49 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, - }; - if (ST->hasAVX512()) { - int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second, - LTSrc.second); - if (Idx != -1) - return AVX512ConversionTbl[Idx].Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); - EVT DstTy = TLI->getValueType(DL, Dst); - - // The function getSimpleVT only handles simple value types. 
- if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX2ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, @@ -579,8 +638,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVXConversionTbl[] = { + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, @@ -650,34 +708,158 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { 
ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
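// Illustrative sketch (assumed, not taken from this patch): the tables above are
// consumed through the pointer-returning lookup helpers this change switches to
// (ConvertCostTableLookup now yields an entry pointer, or null on a miss, instead
// of an index). The self-contained sketch below only mirrors the shape of that
// idiom with simplified stand-in names (ConvEntry, findConversion); the real LLVM
// helpers and entry structs may differ in detail.
#include <algorithm>
#include <cstddef>
#include <iterator>

struct ConvEntry {
  int ISD;   // conversion opcode, e.g. ISD::SINT_TO_FP
  int Dst;   // simple destination value type, e.g. MVT::v4f32
  int Src;   // simple source value type, e.g. MVT::v4i32
  int Cost;  // estimated cost of the legalized conversion
};

// Linear scan over a small static table; returning nullptr on a miss lets the
// caller fall through to the next feature level (AVX512 -> AVX2 -> AVX -> SSE).
template <std::size_t N>
const ConvEntry *findConversion(const ConvEntry (&Tbl)[N], int ISD, int Dst,
                                int Src) {
  auto I = std::find_if(std::begin(Tbl), std::end(Tbl),
                        [&](const ConvEntry &E) {
                          return E.ISD == ISD && E.Dst == Dst && E.Src == Src;
                        });
  return I == std::end(Tbl) ? nullptr : &*I;
}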
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { - int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return AVX2ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } if (ST->hasAVX()) { - int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return AVXConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = { + static const CostTblEntry SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -686,7 +868,7 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = { + static const CostTblEntry AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
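// Illustrative note (assumed, not from this patch): AVX1 provides 256-bit
// floating-point compares but only 128-bit integer compares, so the 256-bit
// integer SETCC entries that follow are presumably costed as a split into two
// 128-bit compares plus a recombine, which is roughly what the cost of 4 reflects.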
@@ -696,54 +878,45 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = { + static const CostTblEntry AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = { + static const CostTblEntry AVX512CostTbl[] = { { ISD::SETCC, MVT::v8i64, 1 }, { ISD::SETCC, MVT::v16i32, 1 }, { ISD::SETCC, MVT::v8f64, 1 }, { ISD::SETCC, MVT::v16f32, 1 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX512CostTbl[Idx].Cost; - } + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX2CostTbl[Idx].Cost; - } + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTbl[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -761,10 +934,9 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) { +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; + int Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) @@ -776,9 +948,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, return Cost; } -unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -796,22 +967,21 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume that all other non-power-of-two numbers are scalarized. 
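// Illustrative example (not from this patch): for a load of a non-power-of-two
// vector such as <3 x float>, the path below charges NumElem scalar operations
// plus the scalarization overhead of rebuilding the vector, i.e. roughly
// 3 * cost(load float) + cost(3 x insertelement); a store instead pays for
// three extractelements before the scalar stores.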
if (!isPowerOf2_32(NumElem)) { - unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), - Alignment, AddressSpace); - unsigned SplitCost = getScalarizationOverhead(Src, - Opcode == Instruction::Load, - Opcode==Instruction::Store); + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); return NumElem * Cost + SplitCost; } } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); // Each load/store unit costs 1. - unsigned Cost = LT.first * 1; + int Cost = LT.first * 1; // On Sandybridge 256bit load/stores are double pumped // (but not on Haswell). @@ -821,9 +991,9 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return Cost; } -unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -832,34 +1002,33 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) || + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization - unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); - unsigned ScalarCompareCost = - getCmpSelInstrCost(Instruction::ICmp, - Type::getInt8Ty(getGlobalContext()), NULL); - unsigned BranchCost = getCFInstrCost(Instruction::Br); - unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - - unsigned ValueSplitCost = - getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load, - Opcode == Instruction::Store); - unsigned MemopCost = + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); - unsigned Cost = 0; - if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. 
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + - getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0); + Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -874,7 +1043,7 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -887,10 +1056,10 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return BaseT::getAddressComputationCost(Ty, IsComplex); } -unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -900,7 +1069,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { + static const CostTblEntry SSE42CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -908,7 +1077,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { + static const CostTblEntry AVX1CostTblPairWise[] = { { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, @@ -919,7 +1088,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i32, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE42CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -927,7 +1096,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 
}; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { + static const CostTblEntry AVX1CostTblNoPairWise[] = { { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, { ISD::FADD, MVT::v8f32, 4 }, @@ -939,29 +1108,21 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, }; if (IsPairwise) { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblNoPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblNoPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); @@ -970,7 +1131,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned X86TTIImpl::getIntImmCost(int64_t Val) { +int X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) return TTI::TCC_Free; @@ -980,7 +1141,7 @@ unsigned X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1004,18 +1165,18 @@ unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1038,6 +1199,26 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::Store: ImmIdx = 0; break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. 
There also other + // similar immediates the backend can use shifts for. + if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1045,10 +1226,8 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: ImmIdx = 1; break; // Always return TCC_Free for the shift value of a shift instruction. @@ -1073,18 +1252,18 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return X86TTIImpl::getIntImmCost(Imm, Ty); } -unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1118,23 +1297,181 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { - int DataWidth = DataTy->getPrimitiveSizeInBits(); +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
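// Illustrative example (assumed, not from this patch): the pattern the heuristic
// below hopes to recognize is roughly
//     %idx64 = sext <16 x i32> %idx to <16 x i64>
//     %ptrs  = getelementptr float, float* %base, <16 x i64> %idx64
//     %v     = <16 x float> gathered through %ptrs
// i.e. a single scalar (or splat) base pointer, constant offsets, and at most one
// variable index that is a sign-extended narrower value. In that case the gather
// can be costed as if it used 32-bit indices instead of being split, since
// 16 x i64 indices do not fit in one zmm register.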
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } - // Todo: AVX512 allows gather/scatter, works with strided and random as well - if ((DataWidth < 32) || (Consecutive == 0)) + // The cost of the scalar loads/stores. 
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + + int InsertExtractCost = 0; + if (Opcode == Instruction::Load) + for (unsigned i = 0; i < VF; ++i) + // Add the cost of inserting each scalar load into the vector + InsertExtractCost += + getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); + else + for (unsigned i = 0; i < VF; ++i) + // Add the cost of extracting each element out of the data vector + InsertExtractCost += + getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); + + return MemoryOpCost + MaskUnpackCost + InsertExtractCost; +} + +/// Calculate the cost of Gather / Scatter operation +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) { + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); + unsigned VF = SrcVTy->getVectorNumElements(); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy && Ptr->getType()->isVectorTy()) + PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); + assert(PtrTy && "Unexpected type for Ptr argument"); + unsigned AddressSpace = PtrTy->getAddressSpace(); + + bool Scalarize = false; + if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + Scalarize = true; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is + // better in the VariableMask case. + if (VF == 2 || (VF == 4 && !ST->hasVLX())) + Scalarize = true; + + if (Scalarize) + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + + return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); +} + +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? + DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + return (DataWidth >= 32 && ST->hasAVX2()); +} + +bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { + return isLegalMaskedLoad(DataType); +} + +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // This function is called now in two cases: from the Loop Vectorizer + // and from the Scalarizer. + // When the Loop Vectorizer asks about legality of the feature, + // the vectorization factor is not calculated yet. The Loop Vectorizer + // sends a scalar type and the decision is based on the width of the + // scalar element. + // Later on, the cost model will estimate usage this intrinsic based on + // the vector type. + // The Scalarizer asks again about legality. It sends a vector type. + // In this case we can reject non-power-of-2 vectors. + if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements())) return false; - if (ST->hasAVX512() || ST->hasAVX2()) - return true; - return false; + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? 
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { - return isLegalMaskedLoad(DataType, Consecutive); +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); } -bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const { +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); // Work this as a subsetting of subtarget features. diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index da3f36c2e27e..adb745e912d1 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -33,13 +33,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *ST; const X86TargetLowering *TLI; - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } public: - explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -62,38 +62,44 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - - unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex); - - unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); - - unsigned getIntImmCost(int64_t); - - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); - bool isLegalMaskedLoad(Type *DataType, int Consecutive); - bool isLegalMaskedStore(Type *DataType, int Consecutive); - bool hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const; + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int 
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); /// @} }; diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 9190d0be9e4d..dce94a9e9ef7 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,8 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" @@ -38,12 +39,16 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehstate" +namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); } + namespace { class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) {} + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &Fn) override; @@ -62,18 +67,13 @@ private: void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); void unlinkExceptionRegistration(IRBuilder<> &Builder); - void addCXXStateStores(Function &F, MachineModuleInfo &MMI); - void addSEHStateStores(Function &F, MachineModuleInfo &MMI); - void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo, - Function &F, int BaseState); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); Function *generateLSDAInEAXThunk(Function *ParentFunc); - int escapeRegNode(Function &F); - // Module-level type getters. Type *getEHLinkRegistrationType(); Type *getSEHRegistrationType(); @@ -111,6 +111,9 @@ FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } char WinEHStatePass::ID = 0; +INITIALIZE_PASS(WinEHStatePass, "x86-winehstate", + "Insert stores for EH state numbers", false, false) + bool WinEHStatePass::doInitialization(Module &M) { TheModule = &M; FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); @@ -138,14 +141,7 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { } bool WinEHStatePass::runOnFunction(Function &F) { - // If this is an outlined handler, don't do anything. We'll do state insertion - // for it in the parent. 
- StringRef WinEHParentName = - F.getFnAttribute("wineh-parent").getValueAsString(); - if (WinEHParentName != F.getName() && !WinEHParentName.empty()) - return false; - - // Check the personality. Do nothing if this is not an MSVC personality. + // Check the personality. Do nothing if this personality doesn't use funclets. if (!F.hasPersonalityFn()) return false; PersonalityFn = @@ -153,7 +149,19 @@ bool WinEHStatePass::runOnFunction(Function &F) { if (!PersonalityFn) return false; Personality = classifyEHPersonality(PersonalityFn); - if (!isMSVCEHPersonality(Personality)) + if (!isFuncletEHPersonality(Personality)) + return false; + + // Skip this function if there are no EH pads and we aren't using IR-level + // outlining. + bool HasPads = false; + for (BasicBlock &BB : F) { + if (BB.isEHPad()) { + HasPads = true; + break; + } + } + if (!HasPads) return false; // Disable frame pointer elimination in this function. @@ -163,14 +171,13 @@ bool WinEHStatePass::runOnFunction(Function &F) { emitExceptionRegistrationRecord(&F); - auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>(); - assert(MMIPtr && "MachineModuleInfo should always be available"); - MachineModuleInfo &MMI = *MMIPtr; - switch (Personality) { - default: llvm_unreachable("unexpected personality function"); - case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break; - case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break; - } + // The state numbers calculated here in IR must agree with what we calculate + // later on for the MachineFunction. In particular, if an IR pass deletes an + // unreachable EH pad after this point before machine CFG construction, we + // will be in trouble. If this assumption is ever broken, we should turn the + // numbers into an immutable analysis pass. + WinEHFuncInfo FuncInfo; + addStateStores(F, FuncInfo); // Reset per-function state. PersonalityFn = nullptr; @@ -261,7 +268,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); // TryLevel = -1 StateFieldIndex = 2; - insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1); + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1); // Handler = __ehhandler$F Function *Trampoline = generateLSDAInEAXThunk(F); Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1); @@ -278,7 +285,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); // TryLevel = -2 / -1 StateFieldIndex = 4; - insertStateNumberStore(RegNode, Builder.GetInsertPoint(), + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), UseStackGuard ? -2 : -1); // ScopeTable = llvm.x86.seh.lsda(F) Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); @@ -347,7 +354,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Value *CastPersonality = Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); - Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++}; + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; CallInst *Call = Builder.CreateCall(CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. 
Call->setTailCall(true); @@ -391,160 +398,53 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { Builder.CreateStore(Next, FSZero); } -void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - calculateWinCXXEHStateNumbers(&F, FuncInfo); - - // The base state for the parent is -1. - addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1); - - // Set up RegNodeEscapeIndex - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Only insert stores in catch handlers. - Constant *FI8 = - ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext())); - for (auto P : FuncInfo.HandlerBaseState) { - Function *Handler = const_cast<Function *>(P.first); - int BaseState = P.second; - IRBuilder<> Builder(&Handler->getEntryBlock(), - Handler->getEntryBlock().begin()); - // FIXME: Find and reuse such a call if present. - Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)}); - Value *RecoveredRegNode = Builder.CreateCall( - FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)}); - RecoveredRegNode = - Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0)); - addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState); - } -} - -/// Escape RegNode so that we can access it from child handlers. Find the call -/// to localescape, if any, in the entry block and append RegNode to the list -/// of arguments. -int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to localescape and extract its arguments. - IntrinsicInst *EscapeCall = nullptr; - for (Instruction &I : F.getEntryBlock()) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - EscapeCall = II; - break; - } - } - SmallVector<Value *, 8> Args; - if (EscapeCall) { - auto Ops = EscapeCall->arg_operands(); - Args.append(Ops.begin(), Ops.end()); - } - Args.push_back(RegNode); - - // Replace the call (if it exists) with new one. Otherwise, insert at the end - // of the entry block. - Instruction *InsertPt = EscapeCall; - if (!EscapeCall) - InsertPt = F.getEntryBlock().getTerminator(); - IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); - Builder.CreateCall(FrameEscape, Args); - if (EscapeCall) - EscapeCall->eraseFromParent(); - return Args.size() - 1; -} +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. + if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); -void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode, - WinEHFuncInfo &FuncInfo, - Function &F, int BaseState) { // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. 
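// Illustrative note (assumed, not from this patch): colorEHFunclets maps each
// basic block to the funclet entry block(s) that can reach it; after WinEH
// preparation every block is expected to have exactly one color, which the
// assert that follows checks. That color decides whether calls in the block get
// the parent's base state of -1 or the base state recorded for the corresponding
// funclet pad.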
+ int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + for (Instruction &I : BB) { if (auto *CI = dyn_cast<CallInst>(&I)) { // Possibly throwing call instructions have no actions to take after // an unwind. Ensure they are in the -1 state. if (CI->doesNotThrow()) continue; - insertStateNumberStore(ParentRegNode, CI, BaseState); + insertStateNumberStore(RegNode, CI, BaseState); } else if (auto *II = dyn_cast<InvokeInst>(&I)) { // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - // FIXME: Why does this assertion fail? - //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!"); - int State = FuncInfo.LandingPadStateMap[LPI]; - insertStateNumberStore(ParentRegNode, II, State); - } - } - } -} - -/// Assign every distinct landingpad a unique state number for SEH. Unlike C++ -/// EH, we can use this very simple algorithm while C++ EH cannot because catch -/// handlers aren't outlined and the runtime doesn't have to figure out which -/// catch handler frame to unwind to. -/// FIXME: __finally blocks are outlined, so this approach may break down there. -void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - - // Remember and return the index that we used. We save it in WinEHFuncInfo so - // that we can lower llvm.x86.seh.recoverfp later in filter functions without - // too much trouble. - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Iterate all the instructions and emit state number stores. - int CurState = 0; - SmallPtrSet<BasicBlock *, 4> ExceptBlocks; - for (BasicBlock &BB : F) { - for (auto I = BB.begin(), E = BB.end(); I != E; ++I) { - if (auto *CI = dyn_cast<CallInst>(I)) { - auto *Intrin = dyn_cast<IntrinsicInst>(CI); - if (Intrin) { - // Calls that "don't throw" are considered to be able to throw asynch - // exceptions, but intrinsics cannot. - continue; - } - insertStateNumberStore(RegNode, CI, -1); - } else if (auto *II = dyn_cast<InvokeInst>(I)) { - // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - auto InsertionPair = - FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState)); - auto Iter = InsertionPair.first; - int &State = Iter->second; - bool Inserted = InsertionPair.second; - if (Inserted) { - // Each action consumes a state number. - auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - assert(!ActionList.empty()); - CurState += ActionList.size(); - State += ActionList.size() - 1; - - // Remember all the __except block targets. 
- for (auto &Handler : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { - auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); -#ifndef NDEBUG - for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) - assert(Pred->isLandingPad() && - "WinEHPrepare failed to split block"); -#endif - ExceptBlocks.insert(BA->getBasicBlock()); - } - } - } + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; insertStateNumberStore(RegNode, II, State); } } } - - // Insert llvm.x86.seh.restoreframe() into each __except block. - Function *RestoreFrame = - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe); - for (BasicBlock *ExceptBB : ExceptBlocks) { - IRBuilder<> Builder(ExceptBB->begin()); - Builder.CreateCall(RestoreFrame, {}); - } } void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, |