| author | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-10-23 17:51:42 +0000 |
| commit | 1d5ae1026e831016fc29fd927877c86af904481f | |
| tree | 2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 | /lib/Target/X86 |
| parent | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 | |
Diffstat (limited to 'lib/Target/X86')
95 files changed, 7513 insertions, 4674 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 95cbf46d37ed..25be79ec2b1e 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -870,6 +870,14 @@ private: bool parseDirectiveFPOEndProc(SMLoc L); bool parseDirectiveFPOData(SMLoc L); + /// SEH directives. + bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo); + bool parseDirectiveSEHPushReg(SMLoc); + bool parseDirectiveSEHSetFrame(SMLoc); + bool parseDirectiveSEHSaveReg(SMLoc); + bool parseDirectiveSEHSaveXMM(SMLoc); + bool parseDirectiveSEHPushFrame(SMLoc); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool validateInstruction(MCInst &Inst, const OperandVector &Ops); @@ -955,6 +963,8 @@ private: public: enum X86MatchResultTy { Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "X86GenAsmMatcher.inc" }; X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, @@ -3173,6 +3183,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, EmitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; + case Match_InvalidImmUnsignedi4: { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } case Match_MissingFeature: return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm); case Match_InvalidOperand: @@ -3520,6 +3537,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, MatchingInlineAsm); } + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidImmUnsignedi4) == 1) { + SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return Error(ErrorLoc, "immediate must be an integer in range [0, 15]", + EmptyRange, MatchingInlineAsm); + } + // If all of these were an outright failure, report it in a useless way. return Error(IDLoc, "unknown instruction mnemonic", EmptyRange, MatchingInlineAsm); @@ -3572,6 +3598,16 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_endproc") return parseDirectiveFPOEndProc(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushreg") + return parseDirectiveSEHPushReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_setframe") + return parseDirectiveSEHSetFrame(DirectiveID.getLoc()); + else if (IDVal == ".seh_savereg") + return parseDirectiveSEHSaveReg(DirectiveID.getLoc()); + else if (IDVal == ".seh_savexmm") + return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); + else if (IDVal == ".seh_pushframe") + return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); return true; } @@ -3708,6 +3744,140 @@ bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) { return getTargetStreamer().emitFPOEndProc(L); } +bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID, + unsigned &RegNo) { + SMLoc startLoc = getLexer().getLoc(); + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + // Try parsing the argument as a register first. 
+ if (getLexer().getTok().isNot(AsmToken::Integer)) { + SMLoc endLoc; + if (ParseRegister(RegNo, startLoc, endLoc)) + return true; + + if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) { + return Error(startLoc, + "register is not supported for use with this directive"); + } + } else { + // Otherwise, an integer number matching the encoding of the desired + // register may appear. + int64_t EncodedReg; + if (getParser().parseAbsoluteExpression(EncodedReg)) + return true; + + // The SEH register number is the same as the encoding register number. Map + // from the encoding back to the LLVM register number. + RegNo = 0; + for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) { + if (MRI->getEncodingValue(Reg) == EncodedReg) { + RegNo = Reg; + break; + } + } + if (RegNo == 0) { + return Error(startLoc, + "incorrect register number for use with this directive"); + } + } + + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) { + unsigned Reg = 0; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushReg(Reg, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify a stack pointer offset"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISetFrame(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveReg(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) { + unsigned Reg = 0; + int64_t Off; + if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg)) + return true; + if (getLexer().isNot(AsmToken::Comma)) + return TokError("you must specify an offset on the stack"); + + getParser().Lex(); + if (getParser().parseAbsoluteExpression(Off)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc); + return false; +} + +bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) { + bool Code = false; + StringRef CodeID; + if (getLexer().is(AsmToken::At)) { + SMLoc startLoc = getLexer().getLoc(); + getParser().Lex(); + if (!getParser().parseIdentifier(CodeID)) { + if (CodeID != "code") + return Error(startLoc, "expected @code"); + Code = true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + getParser().Lex(); + getStreamer().EmitWinCFIPushFrame(Code, Loc); + return false; +} + // Force static initialization. 
extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target()); diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 5bc979d1f18c..e9be28ca77b0 100644 --- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -35,6 +35,10 @@ inline bool isImmUnsignedi8Value(uint64_t Value) { return isUInt<8>(Value) || isInt<8>(Value); } +inline bool isImmUnsignedi4Value(uint64_t Value) { + return isUInt<4>(Value); +} + } // End of namespace llvm #endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index a771ba366318..3a76d023e640 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -260,6 +260,15 @@ struct X86Operand final : public MCParsedAsmOperand { return isImmSExti64i32Value(CE->getValue()); } + bool isImmUnsignedi4() const { + if (!isImm()) return false; + // If this isn't a constant expr, reject it. The immediate byte is shared + // with a register encoding. We can't have it affected by a relocation. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + return isImmUnsignedi4Value(CE->getValue()); + } + bool isImmUnsignedi8() const { if (!isImm()) return false; // If this isn't a constant expr, just assume it fits and let relaxation @@ -491,7 +500,7 @@ struct X86Operand final : public MCParsedAsmOperand { void addGR32orGR64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - unsigned RegNo = getReg(); + MCRegister RegNo = getReg(); if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) RegNo = getX86SubSuperRegister(RegNo, 32); Inst.addOperand(MCOperand::createReg(RegNo)); @@ -572,7 +581,7 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) { SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size()); - auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc); + auto Res = std::make_unique<X86Operand>(Token, Loc, EndLoc); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); return Res; @@ -582,7 +591,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(), StringRef SymName = StringRef(), void *OpDecl = nullptr) { - auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; @@ -593,19 +602,19 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) { - return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc); + return std::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc); } static std::unique_ptr<X86Operand> CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Prefix, StartLoc, EndLoc); Res->Pref.Prefixes = Prefixes; return Res; } static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc) { - auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); Res->Imm.Val = Val; return Res; } 
@@ -615,7 +624,7 @@ struct X86Operand final : public MCParsedAsmOperand { CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), void *OpDecl = nullptr, unsigned FrontendSize = 0) { - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; @@ -643,7 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand { // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && "Invalid scale!"); - auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); + auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index a241362a271d..e287f6625115 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -12,13 +12,14 @@ // //===----------------------------------------------------------------------===// +#include "X86DisassemblerDecoder.h" +#include "llvm/ADT/StringRef.h" + #include <cstdarg> /* for va_*() */ #include <cstdio> /* for vsnprintf() */ #include <cstdlib> /* for exit() */ #include <cstring> /* for memset() */ -#include "X86DisassemblerDecoder.h" - using namespace llvm::X86Disassembler; /// Specifies whether a ModR/M byte is needed and (if so) which diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 54413fa1a02f..f08fcb575bf0 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -287,7 +287,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const { // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); + return !isInt<8>(Value); } // FIXME: Can tblgen help at all here to verify there aren't other instructions @@ -557,7 +557,7 @@ protected: // If the frame pointer is other than esp/rsp, we do not have a way to // generate a compact unwinding representation, so bail out. - if (MRI.getLLVMRegNum(Inst.getRegister(), true) != + if (*MRI.getLLVMRegNum(Inst.getRegister(), true) != (Is64Bit ? X86::RBP : X86::EBP)) return 0; @@ -605,7 +605,7 @@ protected: // unwind encoding. 
return CU::UNWIND_MODE_DWARF; - unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true); SavedRegs[SavedRegIdx++] = Reg; StackAdjust += OffsetSize; InstrOffset += PushInstrSize(Reg); diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 232a06593238..bd009da60851 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -46,10 +46,10 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 }; -static X86_64RelType getType64(unsigned Kind, +static X86_64RelType getType64(MCFixupKind Kind, MCSymbolRefExpr::VariantKind &Modifier, bool &IsPCRel) { - switch (Kind) { + switch (unsigned(Kind)) { default: llvm_unreachable("Unimplemented"); case FK_NONE: @@ -97,7 +97,7 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) { static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, MCSymbolRefExpr::VariantKind Modifier, X86_64RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -202,7 +202,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, // and we want to keep back-compatibility. if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_X86_64_GOTPCREL; - switch (Kind) { + switch (unsigned(Kind)) { default: return ELF::R_X86_64_GOTPCREL; case X86::reloc_riprel_4byte_relax: @@ -237,7 +237,7 @@ static X86_32RelType getType32(X86_64RelType T) { static unsigned getRelocType32(MCContext &Ctx, MCSymbolRefExpr::VariantKind Modifier, X86_32RelType Type, bool IsPCRel, - unsigned Kind) { + MCFixupKind Kind) { switch (Modifier) { default: llvm_unreachable("Unimplemented"); @@ -265,8 +265,9 @@ static unsigned getRelocType32(MCContext &Ctx, if (!Ctx.getAsmInfo()->canRelaxRelocations()) return ELF::R_386_GOT32; - return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X - : ELF::R_386_GOT32; + return Kind == MCFixupKind(X86::reloc_signed_4byte_relax) + ? 
ELF::R_386_GOT32X + : ELF::R_386_GOT32; case MCSymbolRefExpr::VK_GOTOFF: assert(Type == RT32_32); assert(!IsPCRel); @@ -317,7 +318,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); - unsigned Kind = Fixup.getKind(); + MCFixupKind Kind = Fixup.getKind(); X86_64RelType Type = getType64(Kind, Modifier, IsPCRel); if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind); @@ -329,5 +330,5 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, std::unique_ptr<MCObjectTargetWriter> llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) { - return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); + return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index e1125c176b25..d986c829d98e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -163,5 +163,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { TextAlignFillValue = 0x90; + AllowAtInName = true; + UseIntegratedAssembler = true; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 31d26d08a63f..ac36bf3a12fa 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -862,6 +862,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, VEX_B = ~(BaseRegEnc >> 3) & 1; unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg); VEX_X = ~(IndexRegEnc >> 3) & 1; + if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV. + EVEX_V2 = ~(IndexRegEnc >> 4) & 1; + break; } case X86II::MRMSrcReg: { diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ce05ad974507..ced9eacc8b97 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) { return DWARFFlavour::X86_32_Generic; } +bool X86_MC::hasLockPrefix(const MCInst &MI) { + return MI.getFlags() & X86::IP_HAS_LOCK; +} + void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { // FIXME: TableGen these. 
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) { @@ -399,6 +403,9 @@ public: findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, uint64_t GotSectionVA, const Triple &TargetTriple) const override; + Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst, + uint64_t Addr, + uint64_t Size) const override; }; #define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS @@ -511,7 +518,31 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries( return findX86_64PltEntries(PltSectionVA, PltContents); default: return {}; - } + } +} + +Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress( + const MCInst &Inst, uint64_t Addr, uint64_t Size) const { + const MCInstrDesc &MCID = Info->get(Inst.getOpcode()); + int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags); + if (MemOpStart == -1) + return None; + MemOpStart += X86II::getOperandBias(MCID); + + const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg); + const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg); + const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg); + const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt); + const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp); + if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 || + !Disp.isImm()) + return None; + + // RIP-relative addressing. + if (BaseReg.getReg() == X86::RIP) + return Addr + Size + Disp.getImm(); + + return None; } } // end of namespace X86_MC @@ -567,13 +598,13 @@ extern "C" void LLVMInitializeX86TargetMC() { createX86_64AsmBackend); } -unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, - bool High) { +MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size, + bool High) { switch (Size) { - default: return 0; + default: return X86::NoRegister; case 8: if (High) { - switch (Reg) { + switch (Reg.id()) { default: return getX86SubSuperRegisterOrZero(Reg, 64); case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SI; @@ -593,8 +624,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::BH; } } else { - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AL; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -630,8 +661,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } case 16: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -666,8 +697,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15W; } case 32: - switch (Reg) { - default: return 0; + switch (Reg.id()) { + default: return X86::NoRegister; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::EAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -702,7 +733,7 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, return X86::R15D; } case 64: - switch (Reg) { + switch (Reg.id()) { default: return 0; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::RAX; @@ -740,9 +771,9 @@ unsigned 
llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, } } -unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); - assert(Res != 0 && "Unexpected register or VT"); +MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) { + MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != X86::NoRegister && "Unexpected register or VT"); return Res; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 00dd5908cbf5..0c789061f0e1 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/DataTypes.h" #include <string> @@ -57,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH); void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI); + +/// Returns true if this instruction has a LOCK prefix. +bool hasLockPrefix(const MCInst &MI); + /// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc. /// do not need to go through TargetRegistry. MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, @@ -111,12 +116,12 @@ createX86WinCOFFObjectWriter(bool Is64Bit); /// Returns the sub or super register of a specific X86 register. /// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. /// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); +MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false); /// Returns the sub or super register of a specific X86 register. /// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, - bool High = false); +MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned, + bool High = false); } // End llvm namespace diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index fc7e99f61e5e..b67a7508fe72 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -276,7 +276,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( // x86_64 distinguishes movq foo@GOTPCREL so that the linker can // rewrite the movq to an leaq at link time if the symbol ends up in // the same linkage unit. 
- if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load) Type = MachO::X86_64_RELOC_GOT_LOAD; else Type = MachO::X86_64_RELOC_GOT; @@ -339,8 +339,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } else { Type = MachO::X86_64_RELOC_UNSIGNED; - unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) { + if (Fixup.getTargetKind() == X86::reloc_signed_4byte) { Asm.getContext().reportError( Fixup.getLoc(), "32-bit absolute addressing is not supported in 64-bit mode"); @@ -600,5 +599,5 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, std::unique_ptr<MCObjectTargetWriter> llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) { - return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype); + return std::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 3baab9da1c41..760239f76505 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -109,5 +109,5 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr<MCObjectTargetWriter> llvm::createX86WinCOFFObjectWriter(bool Is64Bit) { - return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit); + return std::make_unique<X86WinCOFFObjectWriter>(Is64Bit); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 796a27a17255..db624378d517 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -35,8 +35,9 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { MCStreamer::EmitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive - // actually switches to the .xdata section! - EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); + // actually switches to the .xdata section. + if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo()) + EHStreamer.EmitUnwindInfo(*this, CurFrame); } void X86WinCOFFStreamer::EmitWindowsUnwindTables() { diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index e9987d1f62bd..d5494ef12370 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -170,7 +170,7 @@ bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym, L, "opening new .cv_fpo_proc before closing previous frame"); return true; } - CurFPOData = llvm::make_unique<FPOData>(); + CurFPOData = std::make_unique<FPOData>(); CurFPOData->Function = ProcSym; CurFPOData->Begin = emitFPOLabel(); CurFPOData->ParamsSize = ParamsSize; diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index a95f68434d12..6840fc12751d 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -81,6 +81,12 @@ FunctionPass *createX86FlagsCopyLoweringPass(); /// Return a pass that expands WinAlloca pseudo-instructions. FunctionPass *createX86WinAllocaExpander(); +/// Return a pass that inserts int3 at the end of the function if it ends with a +/// CALL instruction. The pass does the same for each funclet as well. 
This +/// ensures that the open interval of function start and end PCs contains all +/// return addresses for the benefit of the Windows x64 unwinder. +FunctionPass *createX86AvoidTrailingCallPass(); + /// Return a pass that optimizes the code-size of x86 call sequences. This is /// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); @@ -137,13 +143,13 @@ void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86ExpandPseudoPass(PassRegistry&); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); +void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); +void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); - } // End llvm namespace #endif diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3112f00c91f2..d8631aca2734 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -95,7 +95,8 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", "Support 64-bit instructions">; def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", - "64-bit with cmpxchg16b">; + "64-bit with cmpxchg16b", + [FeatureCMPXCHG8B]>; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", @@ -240,8 +241,11 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", "Enable Cache Demote">; def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", "Support ptwrite instruction">; -def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", - "Support MPX instructions">; +// FIXME: This feature is deprecated in 10.0 and should not be used for +// anything, but removing it would break IR files that may contain it in a +// target-feature attribute. +def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false", + "Deprecated. Support MPX instructions">; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", @@ -374,6 +378,10 @@ def FeatureHasFastGather : SubtargetFeature<"fast-gather", "HasFastGather", "true", "Indicates if gather is reasonably fast">; +def FeaturePrefer128Bit + : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true", + "Prefer 128-bit AVX instructions">; + def FeaturePrefer256Bit : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", "Prefer 256-bit AVX instructions">; @@ -449,6 +457,10 @@ def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "Merge branches to a three-way " "conditional branch">; +// Enable use of alias analysis during code generation. 
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + // Bonnell def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">; // Silvermont @@ -579,7 +591,6 @@ def ProcessorFeatures { // Skylake list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureXSAVEC, FeatureXSAVES, FeatureCLFLUSHOPT, @@ -594,6 +605,7 @@ def ProcessorFeatures { // Skylake-AVX512 list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -627,6 +639,7 @@ def ProcessorFeatures { // Cannonlake list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, + FeaturePrefer256Bit, FeatureCDI, FeatureDQI, FeatureBWI, @@ -665,6 +678,17 @@ def ProcessorFeatures { list<SubtargetFeature> ICXFeatures = !listconcat(ICLInheritableFeatures, ICXSpecificFeatures); + //Tigerlake + list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureSHSTK]; + list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather]; + list<SubtargetFeature> TGLInheritableFeatures = + !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures); + list<SubtargetFeature> TGLFeatures = + !listconcat(ICLFeatures, TGLInheritableFeatures ); + // Atom list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87, FeatureCMPXCHG8B, @@ -707,7 +731,6 @@ def ProcessorFeatures { // Goldmont list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES, - FeatureMPX, FeatureSHA, FeatureRDSEED, FeatureXSAVE, @@ -786,6 +809,22 @@ def ProcessorFeatures { list<SubtargetFeature> KNMFeatures = !listconcat(KNLFeatures, [FeatureVPOPCNTDQ]); + // Barcelona + list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87, + FeatureCMPXCHG8B, + FeatureSSE4A, + Feature3DNowA, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureCMOV, + Feature64Bit, + FeatureFastScalarShiftMasks]; + list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures; // Bobcat list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87, @@ -1093,6 +1132,8 @@ def : ProcessorModel<"icelake-client", SkylakeServerModel, ProcessorFeatures.ICLFeatures>; def : ProcessorModel<"icelake-server", SkylakeServerModel, ProcessorFeatures.ICXFeatures>; +def : ProcessorModel<"tigerlake", SkylakeServerModel, + ProcessorFeatures.TGLFeatures>; // AMD CPUs. @@ -1129,10 +1170,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { } foreach P = ["amdfam10", "barcelona"] in { - def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA, - FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV, - Feature64Bit, FeatureFastScalarShiftMasks]>; + def : Proc<P, ProcessorFeatures.BarcelonaFeatures>; } // Bobcat diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 80120722e0e6..8d27be30a277 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -242,7 +242,7 @@ void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo, return PrintOperand(MI, OpNo, O); if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT) O << '%'; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) { unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : (strcmp(Modifier+6,"32") == 0) ? 
32 : @@ -388,7 +388,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI, static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, char Mode, raw_ostream &O) { - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); bool EmitPercent = true; if (!X86::GR8RegClass.contains(Reg) && @@ -575,7 +575,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { // Emitting note header. int WordSize = TT.isArch64Bit() ? 8 : 4; - EmitAlignment(WordSize == 4 ? 2 : 3); + EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0" OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/); @@ -585,7 +585,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4); OutStreamer->EmitIntValue(4, 4); // data size OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data - EmitAlignment(WordSize == 4 ? 2 : 3); // padding + EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 3dcc1015dc7c..69c6b3356cbb 100644 --- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -35,6 +35,7 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -390,7 +391,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand *SMMO = *StoreInst->memoperands_begin(); - unsigned Reg1 = MRI->createVirtualRegister( + Register Reg1 = MRI->createVirtualRegister( TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent()))); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), diff --git a/lib/Target/X86/X86AvoidTrailingCall.cpp b/lib/Target/X86/X86AvoidTrailingCall.cpp new file mode 100644 index 000000000000..fb4f9e2901dc --- /dev/null +++ b/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -0,0 +1,108 @@ +//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The Windows x64 unwinder has trouble unwinding the stack when a return +// address points to the end of the function. This pass maintains the invariant +// that every return address is inside the bounds of its parent function or +// funclet by inserting int3 if the last instruction would otherwise be a call. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#define DEBUG_TYPE "x86-avoid-trailing-call" + +using namespace llvm; + +namespace { + +class X86AvoidTrailingCallPass : public MachineFunctionPass { +public: + X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + StringRef getPassName() const override { + return "X86 avoid trailing call pass"; + } + static char ID; +}; + +char X86AvoidTrailingCallPass::ID = 0; + +} // end anonymous namespace + +FunctionPass *llvm::createX86AvoidTrailingCallPass() { + return new X86AvoidTrailingCallPass(); +} + +// A real instruction is a non-meta, non-pseudo instruction. Some pseudos +// expand to nothing, and some expand to code. This logic conservatively assumes +// they might expand to nothing. +static bool isRealInstruction(MachineInstr &MI) { + return !MI.isPseudo() && !MI.isMetaInstruction(); +} + +// Return true if this is a call instruction, but not a tail call. +static bool isCallInstruction(const MachineInstr &MI) { + return MI.isCall() && !MI.isReturn(); +} + +bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86InstrInfo &TII = *STI.getInstrInfo(); + assert(STI.isTargetWin64() && "pass only runs on Win64"); + + // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops + // before epilogues. + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + // Look for basic blocks that precede funclet entries or are at the end of + // the function. + MachineBasicBlock *NextMBB = MBB.getNextNode(); + if (NextMBB && !NextMBB->isEHFuncletEntry()) + continue; + + // Find the last real instruction in this block, or previous blocks if this + // block is empty. + MachineBasicBlock::reverse_iterator LastRealInstr; + for (MachineBasicBlock &RMBB : + make_range(MBB.getReverseIterator(), MF.rend())) { + LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction); + if (LastRealInstr != RMBB.rend()) + break; + } + + // Do nothing if this function or funclet has no instructions. + if (LastRealInstr == MF.begin()->rend()) + continue; + + // If this is a call instruction, insert int3 right after it with the same + // DebugLoc. Convert back to a forward iterator and advance the insertion + // position once. + if (isCallInstruction(*LastRealInstr)) { + LLVM_DEBUG({ + dbgs() << "inserting int3 after trailing call instruction:\n"; + LastRealInstr->dump(); + dbgs() << '\n'; + }); + + MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse()); + BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(), + TII.get(X86::INT3)); + Changed = true; + } + } + + return Changed; +} diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 4df849a2e14c..ad7e32b4efc8 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -155,12 +155,22 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // This is bad, and breaks SP adjustment. // So, check that all of the frames in the function are closed inside // the same block, and, for good measure, that there are no nested frames. 
+ // + // If any call allocates more argument stack memory than the stack + // probe size, don't do this optimization. Otherwise, this pass + // would need to synthesize additional stack probe calls to allocate + // memory for arguments. unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + bool UseStackProbe = + !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { + if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + return false; if (InsideFrameSequence) return false; InsideFrameSequence = true; @@ -325,8 +335,8 @@ X86CallFrameOptimization::classifyInstruction( for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (!RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isPhysicalRegister(Reg)) continue; if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) return Exit; @@ -370,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, while (I->getOpcode() == X86::LEA32r || I->isDebugInstr()) ++I; - unsigned StackPtr = RegInfo.getStackRegister(); + Register StackPtr = RegInfo.getStackRegister(); auto StackPtrCopyInst = MBB.end(); // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual // register. If it's there, use that virtual register as stack pointer @@ -443,8 +453,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, for (const MachineOperand &MO : I->uses()) { if (!MO.isReg()) continue; - unsigned int Reg = MO.getReg(); - if (RegInfo.isPhysicalRegister(Reg)) + Register Reg = MO.getReg(); + if (Register::isPhysicalRegister(Reg)) UsedRegs.insert(Reg); } } @@ -524,12 +534,12 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, break; case X86::MOV32mr: case X86::MOV64mr: { - unsigned int Reg = PushOp.getReg(); + Register Reg = PushOp.getReg(); // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg // in preparation for the PUSH64. The upper 32 bits can be undef. if (Is64Bit && Store->getOpcode() == X86::MOV32mr) { - unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); Reg = MRI->createVirtualRegister(&X86::GR64RegClass); BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg); BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg) @@ -598,7 +608,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( // movl %eax, (%esp) // call // Get rid of those with prejudice. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return nullptr; // Make sure this is the only use of Reg. 
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index b16b3839c85a..7ee637cfd523 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -102,6 +102,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { DL(MIRBuilder.getMF().getDataLayout()), STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {} + bool isIncomingArgumentHandler() const override { return false; } + Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); @@ -155,8 +157,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, - const CallLowering::ArgInfo &Info, CCState &State) override { - bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State); StackSize = State.getNextStackOffset(); static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, @@ -229,7 +232,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { : ValueHandler(MIRBuilder, MRI, AssignFn), DL(MIRBuilder.getMF().getDataLayout()) {} - bool isArgumentHandler() const override { return true; } + bool isIncomingArgumentHandler() const override { return true; } Register getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { @@ -237,7 +240,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - unsigned AddrReg = MRI.createGenericVirtualRegister( + Register AddrReg = MRI.createGenericVirtualRegister( LLT::pointer(0, DL.getPointerSizeInBits(0))); MIRBuilder.buildFrameIndex(AddrReg, FI); return AddrReg; @@ -301,6 +304,7 @@ struct FormalArgHandler : public IncomingValueHandler { : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMRI()->addLiveIn(PhysReg); MIRBuilder.getMBB().addLiveIn(PhysReg); } }; @@ -372,10 +376,7 @@ bool X86CallLowering::lowerFormalArguments( } bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, - CallingConv::ID CallConv, - const MachineOperand &Callee, - const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const { + CallLoweringInfo &Info) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -385,8 +386,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, auto TRI = STI.getRegisterInfo(); // Handle only Linux C, X86_64_SysV calling conventions for now. - if (!STI.isTargetLinux() || - !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV)) + if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C || + Info.CallConv == CallingConv::X86_64_SysV)) return false; unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -395,18 +396,19 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. bool Is64Bit = STI.is64Bit(); - unsigned CallOpc = Callee.isReg() + unsigned CallOpc = Info.Callee.isReg() ? (Is64Bit ? X86::CALL64r : X86::CALL32r) : (Is64Bit ? 
X86::CALL64pcrel32 : X86::CALLpcrel32); - auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask( - TRI->getCallPreservedMask(MF, CallConv)); + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc) + .add(Info.Callee) + .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); SmallVector<ArgInfo, 8> SplitArgs; - for (const auto &OrigArg : OrigArgs) { + for (const auto &OrigArg : Info.OrigArgs) { // TODO: handle not simple cases. - if (OrigArg.Flags.isByVal()) + if (OrigArg.Flags[0].isByVal()) return false; if (OrigArg.Regs.size() > 1) @@ -423,8 +425,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; - bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed; - if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) { + bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed; + if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -445,23 +447,24 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. - if (Callee.isReg()) + if (Info.Callee.isReg()) MIB->getOperand(0).setReg(constrainOperandRegClass( MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), - *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0)); + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee, + 0)); // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. - if (!OrigRet.Ty->isVoidTy()) { - if (OrigRet.Regs.size() > 1) + if (!Info.OrigRet.Ty->isVoidTy()) { + if (Info.OrigRet.Regs.size() > 1) return false; SplitArgs.clear(); SmallVector<Register, 8> NewRegs; - if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, + if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI, [&](ArrayRef<Register> Regs) { NewRegs.assign(Regs.begin(), Regs.end()); })) @@ -472,7 +475,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; if (!NewRegs.empty()) - MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs); + MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs); } CallSeqStart.addImm(Handler.getStackSize()) diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 0445331bc3ff..444a0c7d0122 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -34,9 +34,8 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<ArrayRef<Register>> VRegs) const override; - bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, - const MachineOperand &Callee, const ArgInfo &OrigRet, - ArrayRef<ArgInfo> OrigArgs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; private: /// A function of this type is used to perform value split action. 
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 1c3034a5116a..4c49d68bec99 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -433,6 +433,7 @@ defm X86_SysV64_RegCall : def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>, // If HiPE, use RetCC_X86_32_HiPE. CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, @@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp index a61fa3246f09..5123853f5455 100644 --- a/lib/Target/X86/X86CmovConversion.cpp +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -436,8 +436,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // Checks for "isUse()" as "uses()" returns also implicit definitions. if (!MO.isReg() || !MO.isUse()) continue; - unsigned Reg = MO.getReg(); - auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + Register Reg = MO.getReg(); + auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)]; if (MachineInstr *DefMI = RDM.lookup(Reg)) { OperandToDefMap[&MO] = DefMI; DepthInfo Info = DepthMap.lookup(DefMI); @@ -456,8 +456,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.isDef()) continue; - unsigned Reg = MO.getReg(); - RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + Register Reg = MO.getReg(); + RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI; } unsigned Latency = TSchedModel.computeInstrLatency(&MI); @@ -710,7 +710,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Skip any CMOVs in this group which don't load from memory. if (!MI.mayLoad()) { // Remember the false-side register input. - unsigned FalseReg = + Register FalseReg = MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg(); // Walk back through any intermediate cmovs referenced. while (true) { @@ -753,7 +753,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches( // Get a fresh register to use as the destination of the MOV. 
const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg()); - unsigned TmpReg = MRI->createVirtualRegister(RC); + Register TmpReg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 4> NewMIs; bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg, @@ -810,9 +810,9 @@ void X86CmovConverterPass::convertCmovInstsToBranches( DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are processing is the opposite condition from the jump we // generated, then we have to swap the operands for the PHI that is going to diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 9dea94f1368d..1bf2d5ba7b8f 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -564,7 +564,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) { Modified = false; break; } - return llvm::make_unique<TargetMBBInfo>(TargetMBBInfo{ + return std::make_unique<TargetMBBInfo>(TargetMBBInfo{ TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly}); } diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index 18bbfa32e11b..b4cf5cafbc6e 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -182,7 +182,7 @@ public: MachineBasicBlock *MBB = MI->getParent(); auto &DL = MI->getDebugLoc(); - unsigned Reg = MRI->createVirtualRegister( + Register Reg = MRI->createVirtualRegister( TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(), *MBB->getParent())); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); @@ -219,13 +219,13 @@ public: // Don't allow copies to/flow GR8/GR16 physical registers. // FIXME: Is there some better way to support this? - unsigned DstReg = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg) && + Register DstReg = MI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg) && (X86::GR8RegClass.contains(DstReg) || X86::GR16RegClass.contains(DstReg))) return false; - unsigned SrcReg = MI->getOperand(1).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + Register SrcReg = MI->getOperand(1).getReg(); + if (Register::isPhysicalRegister(SrcReg) && (X86::GR8RegClass.contains(SrcReg) || X86::GR16RegClass.contains(SrcReg))) return false; @@ -241,7 +241,7 @@ public: // Physical registers will not be converted. Assume that converting the // COPY to the destination domain will eventually result in a actual // instruction. 
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + if (Register::isPhysicalRegister(MO.getReg())) return 1; RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()), @@ -436,7 +436,7 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, if (EnclosedEdges.count(Reg)) return; - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!Register::isVirtualRegister(Reg)) return; if (!MRI->hasOneDef(Reg)) @@ -593,8 +593,8 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { if (!DefOp.isReg()) continue; - unsigned DefReg = DefOp.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { + Register DefReg = DefOp.getReg(); + if (!Register::isVirtualRegister(DefReg)) { C.setAllIllegal(); continue; } @@ -751,7 +751,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { // Go over all virtual registers and calculate a closure. unsigned ClosureID = 0; for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned Reg = Register::index2VirtReg(Idx); // GPR only current source domain supported. if (!isGPR(MRI->getRegClass(Reg))) diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index 58680f1815bb..24c8e6d6f6eb 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -131,7 +131,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) && "ZMM instructions should not be in the EVEX->VEX tables"); diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index b8624b40f2f7..9126a1fbea52 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -194,7 +194,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); - MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands + : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. @@ -259,7 +260,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, ? X86::TAILJMPm : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); - for (unsigned i = 0; i != 5; ++i) + for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { JumpTarget.setIsKill(); @@ -274,7 +275,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI); + MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); @@ -287,7 +288,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, assert(DestAddr.isReg() && "Offset should be in register!"); const bool Uses64BitFramePtr = STI->isTarget64BitLP64() || STI->isTargetNaCl64(); - unsigned StackPtr = TRI->getStackRegister(); + Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr), StackPtr) .addReg(DestAddr.getReg()); @@ -347,7 +348,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // actualcmpxchg Addr // [E|R]BX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); - unsigned SaveRbx = MBBI->getOperand(7).getReg(); + Register SaveRbx = MBBI->getOperand(7).getReg(); unsigned ActualInArg = Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 7b9ce0271205..e5e089d07d55 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::Tail && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && @@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Let SDISel handle vararg functions. @@ -1241,7 +1243,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { } // Make the copy. - unsigned DstReg = VA.getLocReg(); + Register DstReg = VA.getLocReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) @@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE) + CC == CallingConv::HiPE || CC == CallingConv::Tail) return 0; if (CS) @@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { default: return false; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::WebKit_JS: case CallingConv::Swift: case CallingConv::X86_FastCall: @@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. - if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) + if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) || + CC == CallingConv::Tail) return false; // Don't know how to handle Win64 varargs yet. 
Nothing special needed for @@ -3387,6 +3391,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: + case CCValAssign::Trunc: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully @@ -3547,7 +3552,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; - unsigned SrcReg = VA.getLocReg(); + Register SrcReg = VA.getLocReg(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index bf541d933790..9f7c4afde760 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -80,7 +80,7 @@ class FixupBWInstPass : public MachineFunctionPass { /// destination register of the MachineInstr passed in. It returns true if /// that super register is dead just prior to \p OrigMI, and false if not. bool getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const; + Register &SuperDestReg) const; /// Change the MachineInstr \p MI into the equivalent extending load to 32 bit /// register if it is safe to do so. Return the replacement instruction if @@ -92,6 +92,12 @@ class FixupBWInstPass : public MachineFunctionPass { /// nullptr. MachineInstr *tryReplaceCopy(MachineInstr *MI) const; + /// Change the MachineInstr \p MI into the equivalent extend to 32 bit + /// register if it is safe to do so. Return the replacement instruction if + /// OK, otherwise return nullptr. + MachineInstr *tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const; + // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if // possible. Return the replacement instruction if OK, return nullptr // otherwise. @@ -169,10 +175,10 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { /// /// If so, return that super register in \p SuperDestReg. bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, - unsigned &SuperDestReg) const { + Register &SuperDestReg) const { auto *TRI = &TII->getRegisterInfo(); - unsigned OrigDestReg = OrigMI->getOperand(0).getReg(); + Register OrigDestReg = OrigMI->getOperand(0).getReg(); SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32); const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg); @@ -232,12 +238,12 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, // %ax = KILL %ax, implicit killed %eax // RET 0, %ax unsigned Opc = OrigMI->getOpcode(); (void)Opc; - // These are the opcodes currently handled by the pass, if something - // else will be added we need to ensure that new opcode has the same - // properties. - assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || - Opc == X86::MOV16rr) && - "Unexpected opcode."); + // These are the opcodes currently known to work with the code below, if + // something // else will be added we need to ensure that new opcode has the + // same properties. 
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr && + Opc != X86::MOV16rr) + return false; bool IsDefined = false; for (auto &MO: OrigMI->implicit_operands()) { @@ -247,7 +253,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg())) - IsDefined = true; + IsDefined = true; // If MO is a use of any part of the destination register but is not equal // to OrigDestReg or one of its subregisters, we cannot use SuperDestReg. @@ -268,7 +274,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode, MachineInstr *MI) const { - unsigned NewDestReg; + Register NewDestReg; // We are going to try to rewrite this load to a larger zero-extending // load. This is safe if all portions of the 32 bit super-register @@ -295,11 +301,11 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { auto &OldDest = MI->getOperand(0); auto &OldSrc = MI->getOperand(1); - unsigned NewDestReg; + Register NewDestReg; if (!getSuperRegDestIfDead(MI, NewDestReg)) return nullptr; - unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); + Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32); // This is only correct if we access the same subregister index: otherwise, // we could try to replace "movb %ah, %al" with "movl %eax, %eax". @@ -326,6 +332,33 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const { return MIB; } +MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode, + MachineInstr *MI) const { + Register NewDestReg; + if (!getSuperRegDestIfDead(MI, NewDestReg)) + return nullptr; + + // Don't interfere with formation of CBW instructions which should be a + // shorter encoding than even the MOVSX32rr8. It's also immunte to partial + // merge issues on Intel CPUs. + if (MI->getOpcode() == X86::MOVSX16rr8 && + MI->getOperand(0).getReg() == X86::AX && + MI->getOperand(1).getReg() == X86::AL) + return nullptr; + + // Safe to change the instruction. + MachineInstrBuilder MIB = + BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) + MIB.add(MI->getOperand(i)); + + MIB.setMemRefs(MI->memoperands()); + + return MIB; +} + MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const { // See if this is an instruction of the type we are currently looking for. @@ -355,6 +388,15 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI, // of the register. return tryReplaceCopy(MI); + case X86::MOVSX16rr8: + return tryReplaceExtend(X86::MOVSX32rr8, MI); + case X86::MOVSX16rm8: + return tryReplaceExtend(X86::MOVSX32rm8, MI); + case X86::MOVZX16rr8: + return tryReplaceExtend(X86::MOVZX32rr8, MI); + case X86::MOVZX16rm8: + return tryReplaceExtend(X86::MOVZX32rm8, MI); + default: // nothing to do here. break; diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 041529a0be68..543dc8b00fa0 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass { /// - LEA that uses RIP relative addressing mode /// - LEA that uses 16-bit addressing mode " /// This function currently handles the first 2 cases only. 
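The slow-LEA rewrites that follow all preserve the address arithmetic dest = base + index*scale + disp; they only re-express it as cheaper ADD/INC/DEC or two-operand LEA sequences. A tiny standalone check of the scale-1, destination-aliases-base case handled below (illustrative only, made-up values, not part of the pass):

#include <cassert>
#include <cstdint>

// lea disp(%base,%index,scale), %dst computes dst = base + index*scale + disp.
static uint64_t lea(uint64_t Base, uint64_t Index, uint64_t Scale,
                    int64_t Disp) {
  return Base + Index * Scale + Disp;
}

int main() {
  uint64_t Base = 0x1000, Index = 0x20;
  // lea 8(%rbx,%rcx,1), %rbx  ==>  add %rcx, %rbx ; add $8, %rbx
  uint64_t Dst = Base;
  Dst += Index; // ADDrr step
  Dst += 8;     // ADDri step; becomes INC/DEC when the displacement is +/-1
                // and the pass has been asked to prefer INC/DEC
  assert(Dst == lea(Base, Index, /*Scale=*/1, /*Disp=*/8));
  return 0;
}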
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB); + void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, bool OptIncDec); /// Look for LEAs that are really two address LEAs that we might be able to /// turn into regular ADD instructions. @@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) continue; - if (IsSlowLEA) { + if (IsSlowLEA) processInstructionForSlowLEA(I, MBB); - } else if (IsSlow3OpsLEA) { - if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) { - MBB.erase(I); - I = NewMI; - } - } + else if (IsSlow3OpsLEA) + processInstrForSlow3OpLEA(I, MBB, OptIncDec); } // Second pass for creating LEAs. This may reverse some of the @@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) { Reg == X86::R13D || Reg == X86::R13; } -static inline bool isRegOperand(const MachineOperand &Op) { - return Op.isReg() && Op.getReg() != X86::NoRegister; -} - /// Returns true if this LEA uses base an index registers, and the base register /// is known to be inefficient for the subtarget. // TODO: use a variant scheduling class to model the latency profile // of LEA instructions, and implement this logic as a scheduling predicate. static inline bool hasInefficientLEABaseReg(const MachineOperand &Base, const MachineOperand &Index) { - return Base.isReg() && isInefficientLEAReg(Base.getReg()) && - isRegOperand(Index); + return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() && + Index.getReg() != X86::NoRegister; } static inline bool hasLEAOffset(const MachineOperand &Offset) { @@ -372,9 +364,9 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I, !TII->isSafeToClobberEFLAGS(MBB, I)) return false; - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned BaseReg = Base.getReg(); - unsigned IndexReg = Index.getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); // Don't change stack adjustment LEAs. 
if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP)) @@ -500,9 +492,9 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, if (Segment.getReg() != 0 || !Offset.isImm() || !TII->isSafeToClobberEFLAGS(MBB, I)) return; - const unsigned DstR = Dst.getReg(); - const unsigned SrcR1 = Base.getReg(); - const unsigned SrcR2 = Index.getReg(); + const Register DstR = Dst.getReg(); + const Register SrcR1 = Base.getReg(); + const Register SrcR2 = Index.getReg(); if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR)) return; if (Scale.getImm() > 1) @@ -534,111 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I, } } -MachineInstr * -FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI, - MachineBasicBlock &MBB) { +void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I, + MachineBasicBlock &MBB, + bool OptIncDec) { + MachineInstr &MI = *I; const unsigned LEAOpcode = MI.getOpcode(); - const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg); const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt); const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg); const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp); const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg); - if (!(TII->isThreeOperandsLEA(MI) || - hasInefficientLEABaseReg(Base, Index)) || + if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) || !TII->isSafeToClobberEFLAGS(MBB, MI) || Segment.getReg() != X86::NoRegister) - return nullptr; + return; + + Register DestReg = Dest.getReg(); + Register BaseReg = Base.getReg(); + Register IndexReg = Index.getReg(); + + if (MI.getOpcode() == X86::LEA64_32r) { + if (BaseReg != 0) + BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit); + if (IndexReg != 0) + IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit); + } - unsigned DstR = Dst.getReg(); - unsigned BaseR = Base.getReg(); - unsigned IndexR = Index.getReg(); - unsigned SSDstR = - (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR; bool IsScale1 = Scale.getImm() == 1; - bool IsInefficientBase = isInefficientLEAReg(BaseR); - bool IsInefficientIndex = isInefficientLEAReg(IndexR); + bool IsInefficientBase = isInefficientLEAReg(BaseReg); + bool IsInefficientIndex = isInefficientLEAReg(IndexReg); // Skip these cases since it takes more than 2 instructions // to replace the LEA instruction. - if (IsInefficientBase && SSDstR == BaseR && !IsScale1) - return nullptr; - if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && - (IsInefficientIndex || !IsScale1)) - return nullptr; - - const DebugLoc DL = MI.getDebugLoc(); - const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); - const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + if (IsInefficientBase && DestReg == BaseReg && !IsScale1) + return; LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";); + MachineInstr *NewMI = nullptr; + // First try to replace LEA with one or two (for the 3-op LEA case) // add instructions: // 1.lea (%base,%index,1), %base => add %index,%base // 2.lea (%base,%index,1), %index => add %base,%index - if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { - const MachineOperand &Src = DstR == BaseR ? 
Index : Base; - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); - LLVM_DEBUG(NewMI->dump();); - // Create ADD instruction for the Offset in case of 3-Ops LEA. - if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) { + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + if (DestReg != BaseReg) + std::swap(BaseReg, IndexReg); + + if (MI.getOpcode() == X86::LEA64_32r) { + // TODO: Do we need the super register implicit use? + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg) + .addReg(Base.getReg(), RegState::Implicit) + .addReg(Index.getReg(), RegState::Implicit); + } else { + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(BaseReg) + .addReg(IndexReg); } - return NewMI; - } - // If the base is inefficient try switching the index and base operands, - // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: - // lea offset(%base,%index,scale),%dst => - // lea (%base,%index,scale); add offset,%dst - if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .add(IsInefficientBase ? Index : Base) - .add(Scale) - .add(IsInefficientBase ? Base : Index) - .addImm(0) - .add(Segment); + } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); LLVM_DEBUG(NewMI->dump();); + } + + // If either replacement succeeded above, add the offset if needed, then + // replace the instruction. + if (NewMI) { // Create ADD instruction for the Offset in case of 3-Ops LEA. if (hasLEAOffset(Offset)) { - NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); - LLVM_DEBUG(NewMI->dump();); + if (OptIncDec && Offset.isImm() && + (Offset.getImm() == 1 || Offset.getImm() == -1)) { + unsigned NewOpc = + getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg); + LLVM_DEBUG(NewMI->dump();); + } else { + unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset); + NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Offset); + LLVM_DEBUG(NewMI->dump();); + } } - return NewMI; + + MBB.erase(I); + I = NewMI; + return; } + // Handle the rest of the cases with inefficient base register: - assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!"); + assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!"); assert(IsInefficientBase && "efficient base should be handled already!"); + // FIXME: Handle LEA64_32r. 
+ if (LEAOpcode == X86::LEA64_32r) + return; + // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst if (IsScale1 && !hasLEAOffset(Offset)) { - bool BIK = Base.isKill() && BaseR != IndexR; - TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK); + bool BIK = Base.isKill() && BaseReg != IndexReg; + TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK); LLVM_DEBUG(MI.getPrevNode()->dump();); - MachineInstr *NewMI = - BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Index); LLVM_DEBUG(NewMI->dump();); - return NewMI; + return; } + // lea offset(%base,%index,scale), %dst => // lea offset( ,%index,scale), %dst; add %base,%dst - MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode)) - .add(Dst) - .addReg(0) - .add(Scale) - .add(Index) - .add(Offset) - .add(Segment); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode)) + .add(Dest) + .addReg(0) + .add(Scale) + .add(Index) + .add(Offset) + .add(Segment); LLVM_DEBUG(NewMI->dump();); - NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base); + unsigned NewOpc = getADDrrFromLEA(MI.getOpcode()); + NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg) + .addReg(DestReg) + .add(Base); LLVM_DEBUG(NewMI->dump();); - return NewMI; + + MBB.erase(I); + I = NewMI; } diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp index e2d4d1ede6f3..cbde280aa280 100644 --- a/lib/Target/X86/X86FixupSetCC.cpp +++ b/lib/Target/X86/X86FixupSetCC.cpp @@ -136,8 +136,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit() ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; - unsigned ZeroReg = MRI->createVirtualRegister(RC); - unsigned InsertReg = MRI->createVirtualRegister(RC); + Register ZeroReg = MRI->createVirtualRegister(RC); + Register InsertReg = MRI->createVirtualRegister(RC); // Initialize a register with 0. 
This must go before the eflags def BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp index 5ce3255ea96a..cfba06fb6533 100644 --- a/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -721,8 +721,9 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) { X86::CondCode Cond = X86::getCondFromSETCC(MI); - if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() && - TRI->isVirtualRegister(MI.getOperand(0).getReg())) { + if (Cond != X86::COND_INVALID && !MI.mayStore() && + MI.getOperand(0).isReg() && + Register::isVirtualRegister(MI.getOperand(0).getReg())) { assert(MI.getOperand(0).isDef() && "A non-storing SETcc should always define a register!"); CondRegs[Cond] = MI.getOperand(0).getReg(); @@ -739,7 +740,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs( unsigned X86FlagsCopyLoweringPass::promoteCondToReg( MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, X86::CondCode Cond) { - unsigned Reg = MRI->createVirtualRegister(PromoteRC); + Register Reg = MRI->createVirtualRegister(PromoteRC); auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::SETCCr), Reg).addImm(Cond); (void)SetI; @@ -813,7 +814,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic( MachineBasicBlock &MBB = *MI.getParent(); // Insert an instruction that will set the flag back to the desired value. - unsigned TmpReg = MRI->createVirtualRegister(PromoteRC); + Register TmpReg = MRI->createVirtualRegister(PromoteRC); auto AddI = BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri)) .addDef(TmpReg, RegState::Dead) @@ -974,7 +975,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( // Now we need to turn this into a bitmask. We do this by subtracting it from // zero. - unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); ZeroReg = AdjustReg(ZeroReg); @@ -999,7 +1000,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( default: llvm_unreachable("Invalid SETB_C* opcode!"); } - unsigned ResultReg = MRI->createVirtualRegister(&SetBRC); + Register ResultReg = MRI->createVirtualRegister(&SetBRC); BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) .addReg(ZeroReg) .addReg(ExtCondReg); diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 074cf21d03f5..fcfb5bc91314 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -288,8 +288,8 @@ namespace { // Check if a COPY instruction is using FP registers. static bool isFPCopy(MachineInstr &MI) { - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SrcReg = MI.getOperand(1).getReg(); + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); return X86::RFP80RegClass.contains(DstReg) || X86::RFP80RegClass.contains(SrcReg); @@ -313,7 +313,7 @@ FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } /// For example, this returns 3 for X86::FP3. 
static unsigned getFPReg(const MachineOperand &MO) { assert(MO.isReg() && "Expected an FP register!"); - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); return Reg - X86::FP0; } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index e310fe069117..1b469a814adc 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -35,8 +35,8 @@ using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, - unsigned StackAlignOverride) - : TargetFrameLowering(StackGrowsDown, StackAlignOverride, + MaybeAlign StackAlignOverride) + : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(), STI.is64Bit() ? -8 : -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. @@ -176,7 +176,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) @@ -216,7 +216,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != X86::EFLAGS) continue; @@ -995,11 +995,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; bool NeedsDwarfCFI = !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry()); - unsigned FramePtr = TRI->getFrameRegister(MF); - const unsigned MachineFramePtr = + Register FramePtr = TRI->getFrameRegister(MF); + const Register MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; - unsigned BasePtr = TRI->getBaseRegister(); + ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; + Register BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; // Debug location must be unknown since the first debug location is used @@ -1016,14 +1016,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); - - // The default stack probe size is 4096 if the function has no stackprobesize - // attribute. - unsigned StackProbeSize = 4096; - if (Fn.hasFnAttribute("stack-probe-size")) - Fn.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, StackProbeSize); + unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte @@ -1081,7 +1074,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int stackGrowth = -SlotSize; // Find the funclet establisher parameter - unsigned Establisher = X86::NoRegister; + Register Establisher = X86::NoRegister; if (IsClrFunclet) Establisher = Uses64BitFramePtr ? 
X86::RCX : X86::ECX; else if (IsFunclet) @@ -1192,7 +1185,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - unsigned Reg = MBBI->getOperand(0).getReg(); + Register Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { @@ -1396,9 +1389,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { + int Offset; unsigned IgnoredFrameReg; - int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); - Offset += SEHFrameOffset; + if (IsWin64Prologue && IsFunclet) + Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); + else + Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) + + SEHFrameOffset; HasWinCFI = true; assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); @@ -1554,9 +1551,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { unsigned X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); // This is the size of the pushed CSRs. - unsigned CSSize = - MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + // This is the size of callee saved XMMs. + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + unsigned XMMSize = WinEHXMMSlotInfo.size() * + TRI->getSpillSize(X86::VR128RegClass); // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = @@ -1576,7 +1577,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. - return FrameSizeMinusRBP - CSSize; + return FrameSizeMinusRBP + XMMSize - CSSize; } static bool isTailCallOpcode(unsigned Opc) { @@ -1597,9 +1598,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, DL = MBBI->getDebugLoc(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); - unsigned FramePtr = TRI->getFrameRegister(MF); + Register FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; + Is64BitILP32 ? 
Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWin64CFI = @@ -1850,6 +1851,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } +int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); + const auto it = WinEHXMMSlotInfo.find(FI); + + if (it == WinEHXMMSlotInfo.end()) + return getFrameIndexReference(MF, FI, FrameReg); + + FrameReg = TRI->getStackRegister(); + return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; +} + int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &FrameReg, int Adjustment) const { @@ -1948,6 +1963,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); unsigned CalleeSavedFrameSize = 0; + unsigned XMMCalleeSavedFrameSize = 0; + auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); @@ -1984,7 +2001,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. - unsigned FPReg = TRI->getFrameRegister(MF); + Register FPReg = TRI->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); @@ -2025,12 +2042,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( unsigned Size = TRI->getSpillSize(*RC); unsigned Align = TRI->getSpillAlignment(*RC); // ensure alignment - SpillSlotOffset -= std::abs(SpillSlotOffset) % Align; + assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); + SpillSlotOffset = -alignTo(-SpillSlotOffset, Align); + // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Align); + + // Save the start offset and size of XMM in stack frame for funclets. + if (X86::VR128RegClass.contains(Reg)) { + WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize; + XMMCalleeSavedFrameSize += Size; + } } return true; @@ -2200,7 +2225,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, // Spill the BasePtr if it's used. 
if (TRI->hasBasePointer(MF)){ - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); if (STI.isTarget64BitILP32()) BasePtr = getX86SubSuperRegister(BasePtr, 64); SavedRegs.set(BasePtr); @@ -2212,7 +2237,7 @@ HasNestArgument(const MachineFunction *MF) { const Function &F = MF->getFunction(); for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; I++) { - if (I->hasNestAttr()) + if (I->hasNestAttr() && !I->use_empty()) return true; } return false; @@ -2244,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || - CallingConvention == CallingConv::Fast) { + CallingConvention == CallingConv::Fast || + CallingConvention == CallingConv::Tail) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); @@ -2525,6 +2551,18 @@ static unsigned getHiPELiteral( + " required but not provided"); } +// Return true if there are no non-ehpad successors to MBB and there are no +// non-meta instructions between MBBI and MBB.end(). +static bool blockEndIsUnreachable(const MachineBasicBlock &MBB, + MachineBasicBlock::const_iterator MBBI) { + return std::all_of( + MBB.succ_begin(), MBB.succ_end(), + [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) && + std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) { + return MI.isMetaInstruction(); + }); +} + /// Erlang programs may need a special prologue to handle the stack size they /// might need at runtime. That is because Erlang/OTP does not implement a C /// stack but uses a custom implementation of hybrid stack/heap architecture. @@ -2758,7 +2796,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0; + uint64_t Amount = TII.getFrameSize(*I); uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0; I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); @@ -2847,7 +2885,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return I; } - if (isDestroy && InternalAmt) { + if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. 
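The Win64 funclet pieces of this frame-lowering change fit together as follows: assignCalleeSavedSpillSlots records a running offset for each spilled XMM frame index, getWinEHFuncletFrameSize grows the funclet allocation by their total size, and getWin64EHFrameIndexRef resolves those slots against RSP instead of the frame pointer. A rough model of that last computation (hypothetical numbers, simplified helper, not the in-tree code):

#include <cstdint>
#include <map>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Frame index -> cumulative size of the XMM saves spilled before it,
// mirroring the WinEHXMMSlotInfo map added to X86MachineFunctionInfo.
using XMMSlotMap = std::map<int, unsigned>;

// RSP-relative offset of an XMM callee-save slot inside a Win64 funclet:
// the outgoing-argument area comes first (aligned to the stack alignment,
// assumed 16 here), then the saved XMMs in spill order.
static int64_t win64XMMSlotOffset(uint64_t MaxCallFrameSize,
                                  const XMMSlotMap &Slots, int FI) {
  auto It = Slots.find(FI);
  if (It == Slots.end())
    return -1; // not an XMM save; the real code falls back to the normal lookup
  return alignTo(MaxCallFrameSize, 16) + It->second;
}

int main() {
  XMMSlotMap Slots{{-3, 0}, {-4, 16}}; // two 16-byte XMM saves, made-up frame indexes
  return win64XMMSlotOffset(/*MaxCallFrameSize=*/40, Slots, -4) == 64 ? 0 : 1;
}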
@@ -2912,8 +2950,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( "restoring EBP/ESI on non-32-bit target"); MachineFunction &MF = *MBB.getParent(); - unsigned FramePtr = TRI->getFrameRegister(MF); - unsigned BasePtr = TRI->getBaseRegister(); + Register FramePtr = TRI->getFrameRegister(MF); + Register BasePtr = TRI->getBaseRegister(); WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index d32746e3a36e..2103d6471ead 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -25,7 +25,7 @@ class X86RegisterInfo; class X86FrameLowering : public TargetFrameLowering { public: - X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride); + X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride); // Cached subtarget predicates. @@ -99,6 +99,8 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getWin64EHFrameIndexRef(const MachineFunction &MF, + int FI, unsigned &SPReg) const; int getFrameIndexReferenceSP(const MachineFunction &MF, int FI, unsigned &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 95d31e62cafc..5b546d42d98a 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -253,6 +253,11 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -362,6 +367,11 @@ namespace { if (User->getNumOperands() != 2) continue; + // If this can match to INC/DEC, don't count it as a use. + if (User->getOpcode() == ISD::ADD && + (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0)))) + continue; + // Immediates that are used for offsets as part of stack // manipulation should be left alone. 
These are typically // used to indicate SP offsets for argument passing and @@ -502,8 +512,10 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); + bool combineIncDecVector(SDNode *Node); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); + bool tryMatchBitSelect(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); @@ -746,7 +758,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { return false; LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); if (!LD || - LD->isVolatile() || + !LD->isSimple() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; @@ -873,10 +885,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - N->getValueType(0), - N->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + SDValue Res = CurDAG->getNode( + X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; @@ -2305,10 +2316,10 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, return false; // We can allow a full vector load here since narrowing a load is ok unless - // it's volatile. + // it's volatile or atomic. if (ISD::isNON_EXTLoad(N.getNode())) { LoadSDNode *LD = cast<LoadSDNode>(N); - if (!LD->isVolatile() && + if (LD->isSimple() && IsProfitableToFold(N, LD, Root) && IsLegalToFold(N, Parent, Root, OptLevel)) { PatternNodeWithChain = N; @@ -2464,6 +2475,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N, Complexity += 2; } + // Heuristic: try harder to form an LEA from ADD if the operands set flags. + // Unlike ADD, LEA does not affect flags, so we will be less likely to require + // duplicating flag-producing instructions later in the pipeline. + if (N.getOpcode() == ISD::ADD) { + auto isMathWithFlags = [](SDValue V) { + switch (V.getOpcode()) { + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + /* TODO: These opcodes can be added safely, but we may want to justify + their inclusion for different reasons (better for reg-alloc). + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + */ + // Value 1 is the flag output of the node - verify it's not dead. + return !SDValue(V.getNode(), 1).use_empty(); + default: + return false; + } + }; + // TODO: This could be an 'or' rather than 'and' to make the transform more + // likely to happen. We might want to factor in whether there's a + // load folding opportunity for the math op that disappears with LEA. 
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) + Complexity++; + } + if (AM.Disp) Complexity++; @@ -2544,6 +2586,7 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) @@ -2553,6 +2596,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + assert(Root && P && "Unknown root/parent nodes"); + if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. @@ -3302,8 +3359,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { SDValue ImplDef = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef); - NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef, - NBits); + + SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); + insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal); + NBits = SDValue( + CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef, + NBits, SRIdxVal), 0); insertDAGNode(*CurDAG, SDValue(Node, 0), NBits); if (Subtarget->hasBMI2()) { @@ -3400,8 +3461,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? - if (!Subtarget->hasTBM() && - !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) + bool PreferBEXTR = + Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); + if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; // Must have a shift right. @@ -3440,23 +3502,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; - SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); - unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; - unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + // BZHI, if available, is always fast, unlike BEXTR. But even if we decide + // that we can't use BEXTR, it is only worthwhile using BZHI if the mask + // does not fit into 32 bits. Load folding is not a sufficient reason. + if (!PreferBEXTR && MaskSize <= 32) + return nullptr; - // BMI requires the immediate to placed in a register. - if (!Subtarget->hasTBM()) { - ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; - MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + SDValue Control; + unsigned ROpc, MOpc; + + if (!PreferBEXTR) { + assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); + // If we can't make use of BEXTR then we can't fuse shift+mask stages. + // Let's perform the mask first, and apply shift later. 
Note that we need to + // widen the mask to account for the fact that we'll apply shift afterwards! + Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); + ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; + MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; - New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } else { + // The 'control' of BEXTR has the pattern of: + // [15...8 bit][ 7...0 bit] location + // [ bit count][ shift] name + // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 + Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + if (Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + } else { + assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); + // BMI requires the immediate to placed in a register. + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; + Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); + } } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + SDValue Ops[] = { + Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)}; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. @@ -3464,7 +3553,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()}); } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New); + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control); + } + + if (!PreferBEXTR) { + // We still need to apply the shift. + SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT); + unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri; + NewNode = + CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt); } return NewNode; @@ -3735,6 +3832,52 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { return true; } +/// Convert vector increment or decrement to sub/add with an all-ones constant: +/// add X, <1, 1...> --> sub X, <-1, -1...> +/// sub X, <1, 1...> --> add X, <-1, -1...> +/// The all-ones vector constant can be materialized using a pcmpeq instruction +/// that is commonly recognized as an idiom (has no register dependency), so +/// that's better/smaller than loading a splat 1 constant. 
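The legality of that rewrite is plain two's-complement arithmetic: subtracting an all-ones lane is the same as adding one, and the all-ones vector is the cheap pcmpeq idiom. A scalar sanity check of the identity per 32-bit lane (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t AllOnes = ~0u; // one lane of the pcmpeqd-generated constant
  const uint32_t Vals[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Vals) {
    assert(uint32_t(X + 1) == uint32_t(X - AllOnes)); // add X,<1,1,...> -> sub X,<-1,-1,...>
    assert(uint32_t(X - 1) == uint32_t(X + AllOnes)); // sub X,<1,1,...> -> add X,<-1,-1,...>
  }
  return 0;
}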
+bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) { + assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) && + "Unexpected opcode for increment/decrement transform"); + + EVT VT = Node->getValueType(0); + assert(VT.isVector() && "Should only be called for vectors."); + + SDValue X = Node->getOperand(0); + SDValue OneVec = Node->getOperand(1); + + APInt SplatVal; + if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue()) + return false; + + SDLoc DL(Node); + SDValue OneConstant, AllOnesVec; + + APInt Ones = APInt::getAllOnesValue(32); + assert(VT.getSizeInBits() % 32 == 0 && + "Expected bit count to be a multiple of 32"); + OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32); + insertDAGNode(*CurDAG, X, OneConstant); + + unsigned NumElts = VT.getSizeInBits() / 32; + assert(NumElts > 0 && "Expected to get non-empty vector."); + AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts), + DL, OneConstant); + insertDAGNode(*CurDAG, X, AllOnesVec); + + AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec); + insertDAGNode(*CurDAG, X, AllOnesVec); + + unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; + SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec); + + ReplaceNode(Node, NewNode.getNode()); + SelectCode(NewNode.getNode()); + return true; +} + /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large @@ -3975,12 +4118,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, if (CC != ISD::SETEQ && CC != ISD::SETNE) return false; - // See if we're comparing against zero. This should have been canonicalized - // to RHS during lowering. - if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) + SDValue SetccOp0 = Setcc.getOperand(0); + SDValue SetccOp1 = Setcc.getOperand(1); + + // Canonicalize the all zero vector to the RHS. + if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) + std::swap(SetccOp0, SetccOp1); + + // See if we're comparing against zero. + if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) return false; - SDValue N0 = Setcc.getOperand(0); + SDValue N0 = SetccOp0; MVT CmpVT = N0.getSimpleValueType(); MVT CmpSVT = CmpVT.getVectorElementType(); @@ -4027,13 +4176,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { // Look through single use bitcasts. 
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) - Src = Src.getOperand(0); - - if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { Parent = Src.getNode(); Src = Src.getOperand(0); - if (Src.getSimpleValueType() == CmpSVT) + } + + if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Src); + if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) return Src; } @@ -4045,17 +4195,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, bool FoldedBCast = false; if (!FoldedLoad && CanFoldLoads && (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { - SDNode *ParentNode = nullptr; + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); } // Try the other operand. if (!FoldedBCast) { + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedBCast) std::swap(Src0, Src1); } @@ -4125,7 +4276,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // Update the chain. ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); @@ -4146,6 +4297,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, return true; } +// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it +// into vpternlog. +bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { + assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); + + MVT NVT = N->getSimpleValueType(0); + + // Make sure we support VPTERNLOG. + if (!NVT.isVector() || !Subtarget->hasAVX512()) + return false; + + // We need VLX for 128/256-bit. + if (!(Subtarget->hasVLX() || NVT.is512BitVector())) + return false; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Canonicalize AND to LHS. + if (N1.getOpcode() == ISD::AND) + std::swap(N0, N1); + + if (N0.getOpcode() != ISD::AND || + N1.getOpcode() != X86ISD::ANDNP || + !N0.hasOneUse() || !N1.hasOneUse()) + return false; + + // ANDN is not commutable, use it to pick down A and C. + SDValue A = N1.getOperand(0); + SDValue C = N1.getOperand(1); + + // AND is commutable, if one operand matches A, the other operand is B. + // Otherwise this isn't a match. 
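The immediate this match emits below, 0xCA, is the 3-input truth table for the bit-select (A & B) | (~A & C), with A taken as the most significant index bit (the usual VPTERNLOG convention of src1 indexing the high bit). A quick standalone check of that encoding, offered as an illustration rather than a test from the patch:

#include <cassert>
#include <cstdint>

int main() {
  // A VPTERNLOG imm8 is a truth table: result bit = Imm[(a << 2) | (b << 1) | c].
  const uint8_t Imm = 0xCA;
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B)
      for (unsigned C = 0; C <= 1; ++C) {
        unsigned Select = (A & B) | (~A & C & 1u); // bitwise A ? B : C
        unsigned Table = (Imm >> ((A << 2) | (B << 1) | C)) & 1u;
        assert(Select == Table);
      }
  return 0;
}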
+ SDValue B; + if (N0.getOperand(0) == A) + B = N0.getOperand(1); + else if (N0.getOperand(1) == A) + B = N0.getOperand(0); + else + return false; + + SDLoc dl(N); + SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); + SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); + ReplaceNode(N, Ternlog.getNode()); + SelectCode(Ternlog.getNode()); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -4170,6 +4370,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned Opc = 0; switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse3_monitor: if (!Subtarget->hasSSE3()) break; @@ -4303,9 +4504,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (tryShrinkShlLogicImm(Node)) return; + if (Opcode == ISD::OR && tryMatchBitSelect(Node)) + return; + LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { + if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() && + combineIncDecVector(Node)) + return; + // Try to avoid folding immediates with multiple uses for optsize. // This code tries to select to register form directly to avoid going // through the isel table which might fold the immediate. We can't change @@ -4333,6 +4541,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (!isInt<8>(Val) && !isInt<32>(Val)) break; + // If this can match to INC/DEC, let it go. + if (Opcode == ISD::ADD && (Val == 1 || Val == -1)) + break; + // Check if we should avoid folding this immediate. if (!shouldAvoidImmediateInstFormsForSize(N1.getNode())) break; @@ -4610,7 +4822,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; - SExtOpcode = X86::CBW; + SExtOpcode = 0; // Not used. break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; @@ -4632,24 +4844,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) { bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; - if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + if (NVT == MVT::i8) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; - Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, - MVT::Other, Ops); + unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8 + : X86::MOVZX16rm8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()}); } else { - Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0); + unsigned Opc = (isSigned && !signBitIsZero) ? 
X86::MOVSX16rr8 + : X86::MOVZX16rr8; + Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0), + Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { @@ -4996,10 +5211,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(Node); - SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, - Node->getValueType(0), + SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0), Node->getOperand(0), - CurDAG->getConstant(Imm, dl, MVT::i8)); + CurDAG->getTargetConstant(Imm, dl, MVT::i8)); ReplaceNode(Node, Res.getNode()); SelectCode(Res.getNode()); return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0b4bf687e6cf..ed975e9248a8 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -65,17 +65,19 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt<bool> ExperimentalVectorWideningLegalization( - "x86-experimental-vector-widening-legalization", cl::init(false), - cl::desc("Enable an experimental vector type legalization through widening " - "rather than promotion."), - cl::Hidden); - static cl::opt<int> ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), - cl::desc("Sets the preferable loop alignment for experiments " - "(the last x86-experimental-pref-loop-alignment bits" - " of the loop header PC will be 0)."), + cl::desc( + "Sets the preferable loop alignment for experiments (as log2 bytes)" + "(the last x86-experimental-pref-loop-alignment bits" + " of the loop header PC will be 0)."), + cl::Hidden); + +// Added in 10.0. +static cl::opt<bool> EnableOldKNLABI( + "x86-enable-old-knl-abi", cl::init(false), + cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " + "one ZMM register on AVX512F, but not AVX512BW targets."), cl::Hidden); static cl::opt<bool> MulConstantOptimization( @@ -84,6 +86,13 @@ static cl::opt<bool> MulConstantOptimization( "SHIFT, LEA, etc."), cl::Hidden); +static cl::opt<bool> ExperimentalUnorderedISEL( + "x86-experimental-unordered-atomic-isel", cl::init(false), + cl::desc("Use LoadSDNode and StoreSDNode instead of " + "AtomicSDNode for unordered atomic loads and " + "stores respectively."), + cl::Hidden); + /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without @@ -196,7 +205,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); - setOperationAction(ISD::ABS , MVT::i32 , Custom); + setOperationAction(ISD::ABS , MVT::i32 , Custom); } setOperationAction(ISD::ABS , MVT::i64 , Custom); @@ -214,14 +223,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) - // f32/f64 are legal, f80 is custom. 
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); - else - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); - setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); - } else if (!Subtarget.useSoftFloat()) { + if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); @@ -277,29 +279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); - if (Subtarget.is64Bit()) { - if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { - // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); - } else { - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - } - } else if (!Subtarget.useSoftFloat()) { - // Since AVX is a superset of SSE3, only check for SSE here. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) - // Expand FP_TO_UINT into a select. - // FIXME: We would like to use a Custom expander here eventually to do - // the optimal thing for SSE vs. the default expansion in the legalizer. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); - else - // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. - // With SSE3 we can use fisttpll to convert to a signed i64; without - // SSE, we're stuck with a fistpll. - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + if (!Subtarget.useSoftFloat()) { + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. @@ -345,11 +327,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter @@ -396,15 +378,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // There's never any support for operations beyond MVT::f32. 
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); @@ -638,17 +624,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87, except f128 in MMX. + // f80 always uses X87. if (UseX87) { - if (Subtarget.is64Bit() && Subtarget.hasMMX()) { - addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass - : &X86::VR128RegClass); - ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); - setOperationAction(ISD::FABS , MVT::f128, Custom); - setOperationAction(ISD::FNEG , MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); - } - addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -684,10 +661,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f80, Expand); } + // f128 uses xmm registers, but most operations require libcalls. + if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { + addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass + : &X86::VR128RegClass); + + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps + + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + + setOperationAction(ISD::FABS, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + // We need to custom handle any FP_ROUND with an f128 input, but + // LegalizeDAG uses the result type to know when to run a custom handler. + // So we have to list all legal floating point result types here. 
+ if (isTypeLegal(MVT::f32)) { + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + } + if (isTypeLegal(MVT::f64)) { + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); + } + if (isTypeLegal(MVT::f80)) { + setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + } + + setOperationAction(ISD::SETCC, MVT::f128, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + } + // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); + setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); @@ -716,7 +743,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); @@ -754,7 +781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); @@ -797,6 +824,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { @@ -823,10 +852,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } setOperationAction(ISD::MUL, MVT::v2i8, Custom); - setOperationAction(ISD::MUL, MVT::v2i16, Custom); - setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); - setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -863,28 +889,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); - if (!ExperimentalVectorWideningLegalization) { - // Use widening instead of promotion. 
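The f128 block above keeps fp128 values in XMM registers while leaving most arithmetic Custom-lowered to libcalls ("f128 uses xmm registers, but most operations require libcalls"). A minimal source-level illustration, assuming a 64-bit SSE target with __float128 support:

// The operands and result travel in XMM registers, but the addition itself
// becomes a soft-float runtime call (typically __addtf3 on x86-64), since
// there is no native quad-precision ALU.
__float128 addQuad(__float128 a, __float128 b) { return a + b; }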
- for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, - MVT::v4i16, MVT::v2i16 }) { - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); - } - } - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Provide custom widening for v2f32 setcc. This is really for VLX when - // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to - // type legalization changing the result type to v4i1 during widening. - // It works fine for SSE2 and is probably faster so no need to qualify with - // VLX support. - setOperationAction(ISD::SETCC, MVT::v2i32, Custom); - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); @@ -904,19 +912,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } - // We support custom legalizing of sext and anyext loads for specific - // memory vector types which we can load as a scalar (or sequence of - // scalars) and extend in-register to a legal 128-bit vector type. For sext - // loads these must work with a single scalar load. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); - } - for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); @@ -938,7 +933,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); @@ -991,18 +985,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); - if (ExperimentalVectorWideningLegalization) { - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); - } else { - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); - } + setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1069,22 +1059,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - if (!ExperimentalVectorWideningLegalization) { - // Avoid narrow result types when widening. The legal types are listed - // in the next loop. - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); - } - } - // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); - if (!ExperimentalVectorWideningLegalization) - setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1145,6 +1123,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom); + if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); @@ -1292,10 +1272,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STORE, VT, Custom); } - if (HasInt256) - setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - if (HasInt256) { + setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); @@ -1407,6 +1386,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); @@ -1433,12 +1414,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - if (ExperimentalVectorWideningLegalization) { - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); - } + // Need to custom widen this if we don't have AVX512BW. + setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1529,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - // Need to custom split v32i16/v64i8 bitcasts. if (!Subtarget.hasBWI()) { + // Need to custom split v32i16/v64i8 bitcasts. setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); + + // Better to split these into two 256-bit ops. + setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); } if (Subtarget.hasVBMI2()) { @@ -1777,6 +1760,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSHR, VT, Custom); } } + + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } // We want to custom lower some of our intrinsics. @@ -1905,13 +1892,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). - setPrefLoopAlignment(ExperimentalPrefLoopAlignment); + setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; - setPrefFunctionAlignment(4); // 2^4 bytes. 
+ setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); } @@ -1939,8 +1926,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; - if (ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() != 1 && + if (VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; @@ -1950,19 +1936,62 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return MVT::i8; + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // v32i1 vectors should be promoted to v32i8 to match avx2. if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) + return VT.getVectorNumElements(); + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } +unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && + Subtarget.hasAVX512() && + (!isPowerOf2_32(VT.getVectorNumElements()) || + (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || + (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + RegisterVT = MVT::i8; + IntermediateVT = MVT::i1; + NumIntermediates = VT.getVectorNumElements(); + return NumIntermediates; + } + + return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT, + NumIntermediates, RegisterVT); +} + EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { @@ -2060,6 +2089,11 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { + // FIXME: Check if unaligned 64-byte accesses are slow. 
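The getRegisterTypeForCallingConv()/getNumRegistersForCallingConv() hunks above break wide or non-power-of-two vXi1 arguments into one i8 per element so the ABI matches AVX2. A standalone sketch of that predicate (the helper name is made up; isPowerOf2 is open-coded to keep it self-contained):

// True when a vXi1 argument must be scalarized to one i8 per element on an
// AVX-512 target: non-power-of-two element counts, more than 16 elements
// without AVX512BW, or more than 64 elements with it.
static bool splitsMaskVectorToBytes(unsigned NumElts, bool HasBWI) {
  bool IsPow2 = NumElts != 0 && (NumElts & (NumElts - 1)) == 0;
  return !IsPow2 || (NumElts > 16 && !HasBWI) || (NumElts > 64 && HasBWI);
}
// Example: a v64i1 argument on an AVX512F-only (KNL-style) target is passed
// as 64 separate i8 values.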
+ if (Size >= 64 && Subtarget.hasAVX512() && + (Subtarget.getPreferVectorWidth() >= 512)) { + return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; + } // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { @@ -2403,8 +2437,8 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( - const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, - SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA, + const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, + SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); @@ -2537,7 +2571,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); - Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && @@ -2816,6 +2850,10 @@ SDValue X86TargetLowering::LowerCallResult( ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (CopyVT == MVT::f64 && + (Is64Bit && !Subtarget.hasSSE2())) { + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -2925,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || - CC == CallingConv::HHVM); + CC == CallingConv::HHVM || CC == CallingConv::Tail); } /// Return true if we might ever do TCO for calls with this calling convention. @@ -2951,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && canGuaranteeTCO(CC); + return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail; } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { @@ -3405,7 +3443,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. 
- if (Subtarget.hasAVX512() && + if (Subtarget.useAVX512Regs() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; @@ -3577,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; + bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || + CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); @@ -3597,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (Attr.getValueAsString() == "true") isTailCall = false; - if (Subtarget.isPICStyleGOT() && - !MF.getTarget().Options.GuaranteedTailCallOpt) { + if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code @@ -3625,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Sibcalls are automatically detected tailcalls which do not require // ABI changes. - if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) + if (!IsGuaranteeTCO && isTailCall) IsSibcall = true; if (isTailCall) @@ -3657,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; - else if (MF.getTarget().Options.GuaranteedTailCallOpt && - canGuaranteeTCO(CallConv)) + else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -3782,8 +3820,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers - Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], - Subtarget); + Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; @@ -4069,6 +4106,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); + // Save heapallocsite metadata. + if (CLI.CS) + if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) + DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); + // Create the CALLSEQ_END node. 
unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, @@ -4190,7 +4232,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(VR)) + if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -4279,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); + bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || + CalleeCC == CallingConv::Tail; // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this @@ -4286,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( if (IsCalleeWin64 != IsCallerWin64) return false; - if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (IsGuaranteeTCO) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; @@ -4413,7 +4457,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: @@ -4652,7 +4696,11 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, // X < 0 -> X == 0, jump on sign. return X86::COND_S; } - if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) { + // X >= 0 -> X == 0, jump on !sign. + return X86::COND_NS; + } + if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; @@ -4760,7 +4808,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4773,7 +4821,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOLoad; break; } @@ -4785,7 +4833,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = 1; + Info.align = Align::None(); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4811,6 +4859,8 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { + assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow"); + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. 
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); @@ -4852,11 +4902,12 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const { +bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { // If we are using XMM registers in the ABI and the condition of the select is // a floating-point compare and we have blendv or conditional move, then it is // cheaper to select instead of doing a cross-register move and creating a // load that depends on the compare result. + bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); } @@ -4869,15 +4920,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { return true; } -bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const { +bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { // TODO: We handle scalars using custom code, but generic combining could make // that unnecessary. APInt MulC; if (!ISD::isConstantSplatVector(C.getNode(), MulC)) return false; + // Find the type this will be legalized too. Otherwise we might prematurely + // convert this to shl+add/sub and then still have to type legalize those ops. + // Another choice would be to defer the decision for illegal types until + // after type legalization. But constant splat vectors of i64 can't make it + // through type legalization on 32-bit targets so we would need to special + // case vXi64. + while (getTypeAction(Context, VT) != TypeLegal) + VT = getTypeToTransformTo(Context, VT); + // If vector multiply is legal, assume that's faster than shl + add/sub. - // TODO: Multiply is a complex op with higher latency and lower througput in + // TODO: Multiply is a complex op with higher latency and lower throughput in // most implementations, so this check could be loosened based on type // and/or a CPU attribute. if (isOperationLegal(ISD::MUL, VT)) @@ -5022,6 +5083,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const { return Subtarget.hasSSE2(); } +bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { + return X.getValueType().isScalarInteger(); // 'bt' +} + +bool X86TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // For scalars this transform is always beneficial. + if (X.getValueType().isScalarInteger()) + return true; + // If all the shift amounts are identical, then transform is beneficial even + // with rudimentary SSE2 shifts. + if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) + return true; + // If we have AVX2 with it's powerful shift operations, then it's also good. + if (Subtarget.hasAVX2()) + return true; + // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. 
+ return NewShiftOpcode == ISD::SHL; +} + bool X86TargetLowering::shouldFoldConstantShiftPairToMask( const SDNode *N, CombineLevel Level) const { assert(((N->getOpcode() == ISD::SHL && @@ -5054,6 +5142,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { return true; } +bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget.isOSWindows()) + return false; + return true; +} + bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { // Any legal vector type can be splatted more efficiently than // loading/spilling from memory. @@ -5093,10 +5189,8 @@ static bool isUndefOrZero(int Val) { /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (Mask[i] != SM_SentinelUndef) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return M == SM_SentinelUndef; }); } /// Return true if the mask creates a vector whose lower half is undefined. @@ -5119,10 +5213,7 @@ static bool isInRange(int Val, int Low, int Hi) { /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) { - for (int M : Mask) - if (isInRange(M, Low, Hi)) - return true; - return false; + return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if Val is undef or if its value falls within the @@ -5133,12 +5224,9 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. -static bool isUndefOrInRange(ArrayRef<int> Mask, - int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrInRange(M, Low, Hi)) - return false; - return true; +static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) { + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the @@ -5150,10 +5238,8 @@ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { - for (int M : Mask) - if (!isUndefOrZeroOrInRange(M, Low, Hi)) - return false; - return true; + return llvm::all_of( + Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, beginning @@ -5171,8 +5257,9 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, - unsigned Size, int Low) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low) + unsigned Size, int Low, + int Step = 1) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; @@ -5182,10 +5269,8 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size is undef or is zero. 
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - for (unsigned i = Pos, e = Pos + Size; i != e; ++i) - if (!isUndefOrZero(Mask[i])) - return false; - return true; + return llvm::all_of(Mask.slice(Pos, Size), + [](int M) { return isUndefOrZero(M); }); } /// Helper function to test whether a shuffle mask could be @@ -5357,6 +5442,8 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); + } else if (VT.isFloatingPoint()) { + Vec = DAG.getConstantFP(+0.0, dl, VT); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -5500,6 +5587,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && Idx == (VT.getVectorNumElements() / 2) && Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { Ops.push_back(Src.getOperand(1)); Ops.push_back(Sub); @@ -5593,7 +5681,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } @@ -5609,14 +5697,14 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (IdxVal == 0) { // Zero lower bits of the Vec - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. 
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); @@ -5628,7 +5716,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } @@ -5638,30 +5726,30 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, - DAG.getConstant(ShiftRight, dl, MVT::i8)); + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), + DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); + SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } @@ -5675,30 +5763,47 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. 
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. + unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. + unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } -static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT, - unsigned NumElems, SelectionDAG &DAG, - const SDLoc &dl, unsigned VectorWidth) { - SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth); - return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth); +static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, + const SDLoc &dl) { + assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); + EVT SubVT = V1.getValueType(); + EVT SubSVT = SubVT.getScalarType(); + unsigned SubNumElts = SubVT.getVectorNumElements(); + unsigned SubVectorWidth = SubVT.getSizeInBits(); + EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); + SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); + return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); } /// Returns a vector of specified type with all bits set. @@ -5755,6 +5860,34 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, return DAG.getNode(Opcode, DL, VT, In); } +// Match (xor X, -1) -> X. +// Match extract_subvector(xor X, -1) -> extract_subvector(X). +// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). +static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { + V = peekThroughBitcasts(V); + if (V.getOpcode() == ISD::XOR && + ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) + return V.getOperand(0); + if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { + if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { + Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), + Not, V.getOperand(1)); + } + } + SmallVector<SDValue, 2> CatOps; + if (collectConcatOps(V.getNode(), CatOps)) { + for (SDValue &CatOp : CatOps) { + SDValue NotCat = IsNOT(CatOp, DAG); + if (!NotCat) return SDValue(); + CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); + } + return SDValue(); +} + /// Returns a vector_shuffle node for an unpackl operation. 
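The reworked insert1BitVector() lowering above drops the old shift/XOR trick in favor of three shift-isolated pieces that are simply ORed together. A scalar analogue of the same bit manipulation (illustrative only; it assumes 0 < Idx and Idx + SubBits < 64, which the earlier special cases guarantee before this path is reached):

// Insert the low SubBits bits of Sub into Vec at bit position Idx, mirroring
// the KSHIFTL/KSHIFTR sequence used on the k-registers.
static uint64_t insertBitField(uint64_t Vec, uint64_t Sub, unsigned Idx,
                               unsigned SubBits) {
  const unsigned NumBits = 64;
  Sub = (Sub << (NumBits - SubBits)) >> (NumBits - SubBits - Idx); // clear high bits, place at Idx
  uint64_t Low  = (Vec << (NumBits - Idx)) >> (NumBits - Idx);     // bits below Idx
  uint64_t High = (Vec >> (Idx + SubBits)) << (Idx + SubBits);     // bits at/above Idx + SubBits
  return Low | High | Sub;
}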
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -6003,6 +6136,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && + EltSizeInBits <= VT.getScalarSizeInBits()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op); + if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) + return false; + + SDValue Ptr = MemIntr->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry() || + CNode->getOffset() != 0) + return false; + + if (const Constant *C = CNode->getConstVal()) { + unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); + if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { + if (UndefSrcElts[0]) + UndefSrcElts.setBits(0, NumSrcElts); + SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); + return CastBitData(UndefSrcElts, SrcEltBits); + } + } + } + // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector<APInt, 16> SubEltBits; @@ -6123,7 +6287,9 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; } -static bool isConstantSplat(SDValue Op, APInt &SplatVal) { +namespace llvm { +namespace X86 { +bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector<APInt, 16> EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), @@ -6146,6 +6312,8 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) { return false; } +} // namespace X86 +} // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, @@ -6551,13 +6719,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return true; } -/// Check a target shuffle mask's inputs to see if we can set any values to -/// SM_SentinelZero - this is for elements that are known to be zero -/// (not just zeroable) from their inputs. +/// Decode a target shuffle mask and inputs and see if any values are +/// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. -static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl<int> &Mask, - SmallVectorImpl<SDValue> &Ops) { +static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, + SmallVectorImpl<SDValue> &Ops, + APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; @@ -6566,15 +6733,17 @@ static bool setTargetShuffleZeroElements(SDValue N, if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; + int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; + KnownUndef = KnownZero = APInt::getNullValue(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); - unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); + unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. 
APInt UndefSrcElts[2]; @@ -6585,12 +6754,18 @@ static bool setTargetShuffleZeroElements(SDValue N, getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { + for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. - if (M < 0) + if (M < 0) { + assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); continue; + } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; @@ -6599,7 +6774,7 @@ static bool setTargetShuffleZeroElements(SDValue N, // We are referencing an UNDEF input. if (V.isUndef()) { - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); continue; } @@ -6612,31 +6787,64 @@ static bool setTargetShuffleZeroElements(SDValue N, int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) - Mask[i] = SM_SentinelUndef; + KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) - Mask[i] = SM_SentinelZero; + KnownZero.setBit(i); } } - assert(VT.getVectorNumElements() == Mask.size() && + assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } +// Replace target shuffle mask elements with known undef/zero sentinels. +static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask, + const APInt &KnownUndef, + const APInt &KnownZero) { + unsigned NumElts = Mask.size(); + assert(KnownUndef.getBitWidth() == NumElts && + KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); + + for (unsigned i = 0; i != NumElts; ++i) { + if (KnownUndef[i]) + Mask[i] = SM_SentinelUndef; + else if (KnownZero[i]) + Mask[i] = SM_SentinelZero; + } +} + +// Extract target shuffle mask sentinel elements to known undef/zero bitmasks. +static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask, + APInt &KnownUndef, + APInt &KnownZero) { + unsigned NumElts = Mask.size(); + KnownUndef = KnownZero = APInt::getNullValue(NumElts); + + for (unsigned i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (SM_SentinelUndef == M) + KnownUndef.setBit(i); + if (SM_SentinelZero == M) + KnownZero.setBit(i); + } +} + // Forward declaration (for getFauxShuffleMask recursive check). -static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl<SDValue> &Inputs, - SmallVectorImpl<int> &Mask, - SelectionDAG &DAG); +// TODO: Use DemandedElts variant. +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. 
// The decoded shuffle mask may contain a different number of elements to the @@ -6644,7 +6852,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - SelectionDAG &DAG) { + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { Mask.clear(); Ops.clear(); @@ -6685,7 +6894,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Mask.push_back(SM_SentinelUndef); continue; } - uint64_t ByteBits = EltBits[i].getZExtValue(); + const APInt &ByteBits = EltBits[i]; if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); @@ -6696,8 +6905,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case ISD::OR: { // Inspect each operand at the byte level. We can merge these into a // blend shuffle mask if for each byte at least one is masked out (zero). - KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts); - KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts); + KnownBits Known0 = + DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1); + KnownBits Known1 = + DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; unsigned NumSizeInBytes = NumSizeInBits / 8; @@ -6736,14 +6947,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, return false; SmallVector<int, 64> SrcMask0, SrcMask1; SmallVector<SDValue, 2> SrcInputs0, SrcInputs1; - if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) || - !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG)) + if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1, + true) || + !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, + true)) return false; - int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); + size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector<int, 64> Mask0, Mask1; scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1); - for (int i = 0; i != MaskSize; ++i) { + for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) @@ -6751,14 +6964,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) - Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size())); + Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size())); else return false; } - for (SDValue &Op : SrcInputs0) - Ops.push_back(Op); - for (SDValue &Op : SrcInputs1) - Ops.push_back(Op); + Ops.append(SrcInputs0.begin(), SrcInputs0.end()); + Ops.append(SrcInputs1.begin(), SrcInputs1.end()); return true; } case ISD::INSERT_SUBVECTOR: { @@ -6786,8 +6997,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). 
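The shuffle-decoding refactor above (getTargetShuffleAndZeroables plus the resolveZeroablesFromTargetShuffle/resolveTargetShuffleFromZeroables pair) threads two per-element bitmasks alongside the mask instead of baking the sentinels into it. A compact standalone model of that conversion (the sentinel values -1/-2 match LLVM's SM_SentinelUndef/SM_SentinelZero; everything else is illustrative, with a uint64_t standing in for the APInt, so at most 64 elements):

#include <cstdint>
#include <vector>

static void resolveZeroables(const std::vector<int> &Mask,
                             uint64_t &KnownUndef, uint64_t &KnownZero) {
  KnownUndef = KnownZero = 0;
  for (size_t I = 0; I != Mask.size(); ++I) {
    if (Mask[I] == -1) KnownUndef |= uint64_t(1) << I; // SM_SentinelUndef
    if (Mask[I] == -2) KnownZero  |= uint64_t(1) << I; // SM_SentinelZero
  }
}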
SmallVector<int, 64> SubMask; SmallVector<SDValue, 2> SubInputs; - if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, - SubMask, DAG)) + if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, + SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || @@ -6911,14 +7122,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && - DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) || + DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!N1.isUndef() && - DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt)) + DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); - if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) || - (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS))) + if ((!N0.isUndef() && + !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || + (!N1.isUndef() && + !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) return false; } @@ -7061,23 +7274,45 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, Inputs = UsedInputs; } -/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs -/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the -/// remaining input indices in case we now have a unary shuffle and adjust the -/// inputs accordingly. +/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs +/// and then sets the SM_SentinelUndef and SM_SentinelZero values. /// Returns true if the target shuffle mask was decoded. 
-static bool resolveTargetShuffleInputs(SDValue Op, - SmallVectorImpl<SDValue> &Inputs, - SmallVectorImpl<int> &Mask, - SelectionDAG &DAG) { +static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, + SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + APInt &KnownUndef, APInt &KnownZero, + SelectionDAG &DAG, unsigned Depth, + bool ResolveKnownElts) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { + if (ResolveKnownElts) + resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); + return true; + } + if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, + ResolveKnownElts)) { + resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); + return true; + } + return false; +} + +static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, + SmallVectorImpl<int> &Mask, + SelectionDAG &DAG, unsigned Depth = 0, + bool ResolveKnownElts = true) { + EVT VT = Op.getValueType(); + if (!VT.isSimple() || !VT.isVector()) + return false; + + APInt KnownUndef, KnownZero; unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt DemandedElts = APInt::getAllOnesValue(NumElts); - if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) - if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG)) - return false; - - resolveTargetShuffleInputsAndMask(Inputs, Mask); - return true; + return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, + KnownZero, DAG, Depth, ResolveKnownElts); } /// Returns the scalar element that will make up the ith @@ -7414,7 +7649,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getIntPtrConstant(InsertPSMask, DL)); + DAG.getIntPtrConstant(InsertPSMask, DL, true)); return DAG.getBitcast(VT, Result); } @@ -7427,7 +7662,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); - SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8); + SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } @@ -7439,7 +7674,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // the shuffle mask. if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { SDValue Ptr = LD->getBasePtr(); - if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) @@ -7504,6 +7739,49 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, return SDValue(); } +// Recurse to find a LoadSDNode source and the accumulated ByteOffest. 
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { + if (ISD::isNON_EXTLoad(Elt.getNode())) { + auto *BaseLd = cast<LoadSDNode>(Elt); + if (!BaseLd->isSimple()) + return false; + Ld = BaseLd; + ByteOffset = 0; + return true; + } + + switch (Elt.getOpcode()) { + case ISD::BITCAST: + case ISD::TRUNCATE: + case ISD::SCALAR_TO_VECTOR: + return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); + case ISD::SRL: + if (isa<ConstantSDNode>(Elt.getOperand(1))) { + uint64_t Idx = Elt.getConstantOperandVal(1); + if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { + ByteOffset += Idx / 8; + return true; + } + } + break; + case ISD::EXTRACT_VECTOR_ELT: + if (isa<ConstantSDNode>(Elt.getOperand(1))) { + SDValue Src = Elt.getOperand(0); + unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); + unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); + if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && + findEltLoadSrc(Src, Ld, ByteOffset)) { + uint64_t Idx = Elt.getConstantOperandVal(1); + ByteOffset += Idx * (SrcSizeInBits / 8); + return true; + } + } + break; + } + + return false; +} + /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. @@ -7513,6 +7791,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { + if ((VT.getScalarSizeInBits() % 8) != 0) + return SDValue(); + unsigned NumElems = Elts.size(); int LastLoadedElt = -1; @@ -7521,6 +7802,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, APInt UndefMask = APInt::getNullValue(NumElems); SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr); + SmallVector<int64_t, 8> ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7539,13 +7821,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // Each loaded element must be the correct fractional portion of the // requested vector load. - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + unsigned EltSizeInBits = Elt.getValueSizeInBits(); + if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) return SDValue(); - if (!ISD::isNON_EXTLoad(Elt.getNode())) + if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0) + return SDValue(); + unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0); + if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits) return SDValue(); - Loads[i] = cast<LoadSDNode>(Elt); LoadMask.setBit(i); LastLoadedElt = i; } @@ -7575,6 +7860,24 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); + // TODO: Support offsetting the base load. + if (ByteOffsets[FirstLoadedElt] != 0) + return SDValue(); + + // Check to see if the element's load is consecutive to the base load + // or offset from a previous (already checked) load. 
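findEltLoadSrc above peels bitcasts, truncates, constant right shifts and extract_vector_elt while accumulating a byte offset into the underlying load. A toy version of just the offset arithmetic, in plain C++ with invented names; the real code additionally re-checks element sizes and that the load is simple:

    #include <cassert>
    #include <cstdint>

    int64_t offsetAfterSrl(int64_t ByteOffset, uint64_t ShiftAmtBits) {
      assert(ShiftAmtBits % 8 == 0 && "only whole-byte shifts can be folded");
      return ByteOffset + (int64_t)(ShiftAmtBits / 8);
    }

    int64_t offsetAfterExtractElt(int64_t ByteOffset, uint64_t LaneIdx,
                                  unsigned EltBits) {
      assert(EltBits % 8 == 0 && "only byte-sized elements can be folded");
      return ByteOffset + (int64_t)(LaneIdx * (EltBits / 8));
    }

    int main() {
      // e.g. (extractelt <4 x i32> %v, 2) >> 16 starts 8 + 2 = 10 bytes in.
      int64_t Off = offsetAfterExtractElt(0, /*LaneIdx=*/2, /*EltBits=*/32);
      Off = offsetAfterSrl(Off, /*ShiftAmtBits=*/16);
      return Off == 10 ? 0 : 1;
    }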
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { + LoadSDNode *Ld = Loads[EltIdx]; + int64_t ByteOffset = ByteOffsets[EltIdx]; + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); + return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); + } + return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, + EltIdx - FirstLoadedElt); + }; + // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. @@ -7582,8 +7885,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, - i - FirstLoadedElt)) { + if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; @@ -7595,8 +7897,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); - assert(!(MMOFlags & MachineMemOperand::MOVolatile) && - "Cannot merge volatile loads."); + assert(LDBase->isSimple() && + "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); @@ -7636,17 +7938,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && VT.isVector()) { - SmallVector<int, 4> ClearMask(NumElems, -1); - for (unsigned i = 0; i < NumElems; ++i) { - if (ZeroMask[i]) - ClearMask[i] = i + NumElems; - else if (LoadMask[i]) - ClearMask[i] = i; + unsigned NumMaskElts = VT.getVectorNumElements(); + if ((NumMaskElts % NumElems) == 0) { + unsigned Scale = NumMaskElts / NumElems; + SmallVector<int, 4> ClearMask(NumMaskElts, -1); + for (unsigned i = 0; i < NumElems; ++i) { + if (UndefMask[i]) + continue; + int Offset = ZeroMask[i] ? NumMaskElts : 0; + for (unsigned j = 0; j != Scale; ++j) + ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; + } + SDValue V = CreateLoad(VT, LDBase); + SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) + : DAG.getConstantFP(0.0, DL, VT); + return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } - SDValue V = CreateLoad(VT, LDBase); - SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) - : DAG.getConstantFP(0.0, DL, VT); - return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } @@ -8194,34 +8501,10 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); - if (ISD::isBuildVectorAllZeros(Op.getNode())) - return Op; - - if (ISD::isBuildVectorAllOnes(Op.getNode())) + if (ISD::isBuildVectorAllZeros(Op.getNode()) || + ISD::isBuildVectorAllOnes(Op.getNode())) return Op; - if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { - // Split the pieces. 
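When the merged load still contains known-zero elements, the code above widens the clear mask to the legal vector's element count and shuffles against a zero vector. A standalone sketch of that mask construction, assuming NumMaskElts is a multiple of the source element count exactly as the guard above requires (names invented):

    #include <vector>

    std::vector<int> buildClearMask(const std::vector<char> &ZeroElt,   // 1 = must be zero
                                    const std::vector<char> &UndefElt,  // 1 = undef
                                    unsigned NumMaskElts) {
      unsigned NumElems = (unsigned)ZeroElt.size();
      unsigned Scale = NumMaskElts / NumElems;
      std::vector<int> ClearMask(NumMaskElts, -1);
      for (unsigned i = 0; i != NumElems; ++i) {
        if (UndefElt[i])
          continue;                                   // undef lanes stay -1
        int Offset = ZeroElt[i] ? (int)NumMaskElts : 0;
        for (unsigned j = 0; j != Scale; ++j)
          ClearMask[i * Scale + j] = (int)(i * Scale + j) + Offset;
      }
      return ClearMask;
    }

    int main() {
      // 4 source elements widened to an 8-lane mask: element 1 zero, element 3 undef.
      auto M = buildClearMask({0, 1, 0, 0}, {0, 0, 0, 1}, 8);
      return (M[2] == 10 && M[7] == -1) ? 0 : 1;   // element 1's lanes pick the zero operand
    }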
- SDValue Lower = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32)); - SDValue Upper = - DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32)); - // We have to manually lower both halves so getNode doesn't try to - // reassemble the build_vector. - Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget); - Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); - } - SDValue Imm = ConvertI1VectorToInteger(Op, DAG); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, Imm); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } - - // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector<unsigned, 16> NonConstIdx; bool IsSplat = true; @@ -8244,29 +8527,40 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" - if (IsSplat) - return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + if (IsSplat) { + // The build_vector allows the scalar element to be larger than the vector + // element type. We need to mask it to use as a condition unless we know + // the upper bits are zero. + // FIXME: Use computeKnownBits instead of checking specific opcode? + SDValue Cond = Op.getOperand(SplatIdx); + assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); + if (Cond.getOpcode() != ISD::SETCC) + Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, + DAG.getConstant(1, dl, MVT::i8)); + return DAG.getSelect(dl, VT, Cond, DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); + } // insert elements one by one SDValue DstVec; - SDValue Imm; - if (Immediate) { - MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); - Imm = DAG.getConstant(Immediate, dl, ImmVT); - } - else if (HasConstElts) - Imm = DAG.getConstant(0, dl, VT); - else - Imm = DAG.getUNDEF(VT); - if (Imm.getValueSizeInBits() == VT.getSizeInBits()) - DstVec = DAG.getBitcast(VT, Imm); - else { - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); - DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, - DAG.getIntPtrConstant(0, dl)); - } + if (HasConstElts) { + if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); + SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); + ImmL = DAG.getBitcast(MVT::v32i1, ImmL); + ImmH = DAG.getBitcast(MVT::v32i1, ImmH); + DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); + } else { + MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U)); + SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); + MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; + DstVec = DAG.getBitcast(VecVT, Imm); + DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, + DAG.getIntPtrConstant(0, dl)); + } + } else + DstVec = DAG.getUNDEF(VT); for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; @@ -8757,7 +9051,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, // If we don't need the upper xmm, then perform as a xmm hop. 
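For all-constant v64i1 build_vectors on 32-bit targets, the new path above splits the 64-bit immediate with Lo_32/Hi_32 and concatenates two v32i1 constants. The split itself is ordinary integer arithmetic; a small self-contained check with invented helper names:

    #include <cstdint>

    uint32_t lo32(uint64_t V) { return (uint32_t)V; }
    uint32_t hi32(uint64_t V) { return (uint32_t)(V >> 32); }

    int main() {
      uint64_t Immediate = 0x8000000000000001ULL;    // bit 0 and bit 63 set
      uint32_t L = lo32(Immediate), H = hi32(Immediate);
      uint64_t Rebuilt = ((uint64_t)H << 32) | L;    // concat of the two halves
      return (Rebuilt == Immediate && L == 1u && H == 0x80000000u) ? 0 : 1;
    }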
unsigned HalfNumElts = NumElts / 2; if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) { - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128); V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128); SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1); @@ -8965,21 +9259,14 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. - if (ISD::isBuildVectorAllZeros(Op.getNode())) { - // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd - // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. - if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) - return Op; - - return getZeroVector(VT, Subtarget, DAG, DL); - } + if (ISD::isBuildVectorAllZeros(Op.getNode())) + return Op; // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { - if (VT == MVT::v4i32 || VT == MVT::v16i32 || - (VT == MVT::v8i32 && Subtarget.hasInt256())) + if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getOnesVector(VT, DAG, DL); @@ -9150,9 +9437,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9186,9 +9473,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, - LoLo, HiHi, IndicesVec, - DAG.getConstant(0, DL, MVT::i8))); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, + IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( @@ -9283,7 +9570,7 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, return SDValue(); auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1)); - if (!PermIdx || PermIdx->getZExtValue() != Idx) + if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } @@ -9434,23 +9721,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // it to i32 first. 
if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); - if (VT.getSizeInBits() >= 256) { - MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); - if (Subtarget.hasAVX()) { - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } else { - // Without AVX, we need to extend to a 128-bit vector and then - // insert into the 256-bit vector. - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); - Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); - } - } else { - assert(VT.is128BitVector() && "Expected an SSE value type!"); - Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); - Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); - } + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } @@ -9549,8 +9822,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. - return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl, - VT.getSizeInBits() / 2); + return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. @@ -9703,8 +9975,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef<SDUse> Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9745,30 +10016,47 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); - unsigned NumZero = 0; - unsigned NumNonZero = 0; + uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; + assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) - ++NumZero; - else { - assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. + Zeros |= (uint64_t)1 << i; + else NonZeros |= (uint64_t)1 << i; - ++NumNonZero; - } } + unsigned NumElems = ResVT.getVectorNumElements(); + + // If we are inserting non-zero vector and there are zeros in LSBs and undef + // in the MSBs we need to emit a KSHIFTL. The generic lowering to + // insert_subvector will give us two kshifts. + if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && + Log2_64(NonZeros) != NumOperands - 1) { + MVT ShiftVT = ResVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) + ShiftVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + unsigned Idx = Log2_64(NonZeros); + SDValue SubVec = Op.getOperand(Idx); + unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT, + DAG.getUNDEF(ShiftVT), SubVec, + DAG.getIntPtrConstant(0, dl)); + Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec, + DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, + DAG.getIntPtrConstant(0, dl)); + } // If there are zero or one non-zeros we can handle this very simply. - if (NumNonZero <= 1) { - SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) - : DAG.getUNDEF(ResVT); - if (!NumNonZero) + if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { + SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); + if (!NonZeros) return Vec; - unsigned Idx = countTrailingZeros(NonZeros); + unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, @@ -9776,8 +10064,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, } if (NumOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef<SDUse> Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); @@ -9786,7 +10073,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } - assert(NumNonZero == 2 && "Simple cases not handled?"); + assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK @@ -9794,7 +10081,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); - unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } @@ -9997,42 +10283,44 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask, /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// -/// SM_SentinelZero is accepted as a valid negative index but must match in both. +/// SM_SentinelZero is accepted as a valid negative index but must match in +/// both. static bool isTargetShuffleEquivalent(ArrayRef<int> Mask, - ArrayRef<int> ExpectedMask) { + ArrayRef<int> ExpectedMask, + SDValue V1 = SDValue(), + SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) && "Illegal target shuffle mask"); - for (int i = 0; i < Size; ++i) - if (Mask[i] == SM_SentinelUndef) - continue; - else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) - return false; - else if (Mask[i] != ExpectedMask[i]) - return false; - - return true; -} + // Check for out-of-range target shuffle mask indices. + if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) + return false; -// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle -// mask. 
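The KSHIFTL special case above fires only when exactly one concatenated operand is non-zero, everything below it is known zero, and it is not already the top operand; the shift amount is then the operand index times the subvector width. A plain-C++ sketch of that decision (hasSingleBit and log2u64 stand in for isPowerOf2_64 and Log2_64; the function name is made up):

    #include <cstdint>

    bool hasSingleBit(uint64_t V) { return V && !(V & (V - 1)); }
    unsigned log2u64(uint64_t V) { unsigned L = 0; while (V >>= 1) ++L; return L; }

    // Returns the shift amount, or -1 if the single-KSHIFTL pattern doesn't apply.
    int matchConcatAsKShiftL(uint64_t NonZeros, uint64_t Zeros,
                             unsigned NumOperands, unsigned SubVecNumElts) {
      if (!hasSingleBit(NonZeros) || Zeros == 0 || NonZeros <= Zeros ||
          log2u64(NonZeros) == NumOperands - 1)
        return -1;
      return (int)(log2u64(NonZeros) * SubVecNumElts);
    }

    int main() {
      // concat of 4 x v4i1: operand 2 is live, operands 0-1 are zero, operand 3 undef.
      return matchConcatAsKShiftL(/*NonZeros=*/0b0100, /*Zeros=*/0b0011,
                                  /*NumOperands=*/4, /*SubVecNumElts=*/4) == 8 ? 0 : 1;
    }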
-static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask, - const APInt &Zeroable) { - int NumElts = Mask.size(); - assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); + // If the values are build vectors, we can look through them to find + // equivalent inputs that make the shuffles equivalent. + auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1); + auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2); + BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1); + BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2); - SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef); - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) + for (int i = 0; i < Size; ++i) { + if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i]) continue; - assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); - TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); + if (0 <= Mask[i] && 0 <= ExpectedMask[i]) { + auto *MaskBV = Mask[i] < Size ? BV1 : BV2; + auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; + if (MaskBV && ExpectedBV && + MaskBV->getOperand(Mask[i] % Size) == + ExpectedBV->getOperand(ExpectedMask[i] % Size)) + continue; + } + // TODO - handle SM_Sentinel equivalences. + return false; } - return TargetMask; + return true; } // Attempt to create a shuffle mask from a VSELECT condition mask. @@ -10133,7 +10421,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) { static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL, SelectionDAG &DAG) { - return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); + return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// Compute whether each element of a shuffle is zeroable. @@ -10573,14 +10861,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, // Try binary shuffle. SmallVector<int, 32> BinaryMask; createPackShuffleMask(VT, BinaryMask, false); - if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) + if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector<int, 32> UnaryMask; createPackShuffleMask(VT, UnaryMask, true); - if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) + if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) if (MatchPACK(V1, V1)) return true; @@ -10685,9 +10973,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, - MutableArrayRef<int> TargetMask, - bool &ForceV1Zero, bool &ForceV2Zero, - uint64_t &BlendMask) { + MutableArrayRef<int> Mask, + const APInt &Zeroable, bool &ForceV1Zero, + bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = @@ -10695,13 +10983,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; - assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); + assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. - // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. 
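isTargetShuffleEquivalent now also accepts mismatched indices when both sides refer to identical constant build-vector elements. A toy model of that relaxation over plain integer "build vectors"; the function and variable names are invented for the sketch and the sentinel handling is simplified:

    #include <vector>

    constexpr int Undef = -1;

    bool masksEquivalent(const std::vector<int> &Mask, const std::vector<int> &Expected,
                         const std::vector<int> &BV1,   // constant elements of source 1
                         const std::vector<int> &BV2) { // constant elements of source 2
      int Size = (int)Mask.size();
      if (Size != (int)Expected.size())
        return false;
      for (int i = 0; i != Size; ++i) {
        if (Mask[i] == Undef || Mask[i] == Expected[i])
          continue;
        if (Mask[i] < 0 || Expected[i] < 0)
          return false;                       // other sentinels must match exactly
        const std::vector<int> &A = Mask[i] < Size ? BV1 : BV2;
        const std::vector<int> &B = Expected[i] < Size ? BV1 : BV2;
        if (A[Mask[i] % Size] != B[Expected[i] % Size])
          return false;                       // different underlying scalars
      }
      return true;
    }

    int main() {
      // Indices 1 (from V1) and 5 (from V2) differ, but both select the value 7.
      std::vector<int> BV1 = {3, 7, 9, 11}, BV2 = {0, 7, 0, 0};
      return masksEquivalent({0, 1, 2, 3}, {0, 5, 2, 3}, BV1, BV2) ? 0 : 1;
    }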
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { - int M = TargetMask[i]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; if (M == SM_SentinelUndef) continue; if (M == i) @@ -10710,16 +10997,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, BlendMask |= 1ull << i; continue; } - if (M == SM_SentinelZero) { + if (Zeroable[i]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; - TargetMask[i] = i; + Mask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; - TargetMask[i] = i + Size; + Mask[i] = i + Size; continue; } } @@ -10748,11 +11035,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable); - uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; - if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, + SmallVector<int, 64> Mask(Original.begin(), Original.end()); + if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); @@ -10778,7 +11064,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, case MVT::v8i16: assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); case MVT::v16i16: { assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); SmallVector<int, 8> RepeatedMask; @@ -10790,7 +11076,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(BlendMask, DL, MVT::i8)); + DAG.getTargetConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. 
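matchVectorShuffleAsBlend above now consumes the Zeroable bit set directly: a zeroable result element may be taken from whichever operand is already all-zero or undef, forcing that operand to a real zero vector. A rough standalone model that mirrors the loop above but is not the LLVM helper:

    #include <cstdint>
    #include <vector>

    constexpr int Undef = -1;

    bool matchAsBlend(std::vector<int> &Mask, const std::vector<bool> &Zeroable,
                      bool V1AllZero, bool V2AllZero,
                      bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) {
      int Size = (int)Mask.size();
      BlendMask = 0;
      ForceV1Zero = ForceV2Zero = false;
      for (int i = 0; i != Size; ++i) {
        int M = Mask[i];
        if (M == Undef || M == i)
          continue;                         // undef, or already from the first source
        if (M == i + Size) {
          BlendMask |= 1ull << i;           // element i comes from the second source
          continue;
        }
        if (Zeroable[i]) {
          if (V1AllZero) { ForceV1Zero = true; Mask[i] = i; continue; }
          if (V2AllZero) { ForceV2Zero = true; BlendMask |= 1ull << i; Mask[i] = i + Size; continue; }
        }
        return false;                       // needs a lane the blend can't provide
      }
      return true;
    }

    int main() {
      std::vector<int> Mask = {0, 5, 2, 7};
      std::vector<bool> Zeroable(4, false);
      bool F1, F2; uint64_t BM;
      bool OK = matchAsBlend(Mask, Zeroable, false, false, F1, F2, BM);
      return (OK && BM == 0b1010) ? 0 : 1;  // elements 1 and 3 come from V2
    }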
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to @@ -10799,9 +11085,9 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(LoMask, DL, MVT::i8)); + DAG.getTargetConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, - DAG.getConstant(HiMask, DL, MVT::i8)); + DAG.getTargetConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); @@ -11061,7 +11347,7 @@ static SDValue lowerShuffleAsByteRotateAndPermute( SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), - DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); + DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { @@ -11268,7 +11554,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, - DAG.getConstant(ByteRotation, DL, MVT::i8))); + DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && @@ -11282,10 +11568,12 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; - SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, - DAG.getConstant(LoByteShift, DL, MVT::i8)); - SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, - DAG.getConstant(HiByteShift, DL, MVT::i8)); + SDValue LoShift = + DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, + DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); + SDValue HiShift = + DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, + DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } @@ -11317,7 +11605,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, - DAG.getConstant(Rotation, DL, MVT::i8)); + DAG.getTargetConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a byte shift sequence. 
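The SSE2 fallback in lowerShuffleAsByteRotate above builds the rotation from a PSRLDQ, a PSLLDQ and an OR. For the single-input case (Lo == Hi == V) that is just a rotate of the 16 bytes; a scalar model, with the assumed byte-shift semantics spelled out in the comments:

    #include <array>
    #include <cstdint>

    using V16 = std::array<uint8_t, 16>;

    V16 byteRotate(const V16 &V, unsigned R) {   // result[j] = V[(j + R) % 16]
      V16 Out{};
      for (unsigned j = 0; j != 16; ++j) {
        uint8_t SrlPart = (j + R < 16) ? V[j + R] : 0;                    // PSRLDQ by R
        uint8_t SllPart = (R != 0 && j >= 16 - R) ? V[j + R - 16] : 0;    // PSLLDQ by 16-R
        Out[j] = (uint8_t)(SrlPart | SllPart);                            // OR the halves
      }
      return Out;
    }

    int main() {
      V16 V{};
      for (unsigned i = 0; i != 16; ++i) V[i] = (uint8_t)i;
      V16 R = byteRotate(V, 5);
      return (R[0] == 5 && R[15] == (5 + 15) % 16) ? 0 : 1;
    }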
@@ -11356,27 +11644,27 @@ static SDValue lowerVectorShuffleAsByteShiftMask( if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroHi, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * Shift, DL, MVT::i8)); + DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, - DAG.getConstant(Scale * ZeroLo, DL, MVT::i8)); + DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); @@ -11498,7 +11786,7 @@ static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } @@ -11632,14 +11920,14 @@ static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -11686,9 +11974,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; - // Found a valid zext mask! Try various lowering strategies based on the + // Found a valid a/zext mask! Try various lowering strategies based on the // input type and available ISA extensions. - // TODO: Add AnyExt support. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. 
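The PSHUFB-based extension path above now distinguishes an any-extend (filler bytes undef) from a zero-extend (filler bytes 0x80, which PSHUFB turns into zero). A sketch of the control-byte layout, with indices modelled as plain ints, -1 standing for an undef byte, and SafeOffset approximated by a bounds check:

    #include <vector>

    std::vector<int> pshufbExtendMask(unsigned Scale, unsigned Offset, bool AnyExt) {
      std::vector<int> M(16);
      for (unsigned i = 0; i != 16; ++i) {
        unsigned Idx = Offset + i / Scale;
        if (i % Scale == 0 && Idx < 16)
          M[i] = (int)Idx;                 // keep this source byte
        else
          M[i] = AnyExt ? -1 : 0x80;       // don't-care vs. force-to-zero
      }
      return M;
    }

    int main() {
      // Zero-extend bytes 0..7 into 16-bit lanes: {0, 0x80, 1, 0x80, 2, ...}
      auto M = pshufbExtendMask(/*Scale=*/2, /*Offset=*/0, /*AnyExt=*/false);
      return (M[0] == 0 && M[1] == 0x80 && M[2] == 1) ? 0 : 1;
    }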
@@ -11697,7 +11984,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL, + ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -11736,8 +12024,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(LoIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(LoIdx, DL, MVT::i8))); if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); @@ -11745,8 +12033,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(HiIdx, DL, MVT::i8))); + DAG.getTargetConstant(EltBits, DL, MVT::i8), + DAG.getTargetConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -11759,8 +12047,12 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); - PSHUFBMask[i] = DAG.getConstant( - (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + if ((i % Scale == 0 && SafeOffset(Idx))) { + PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8); + continue; + } + PSHUFBMask[i] = + AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( @@ -12052,9 +12344,9 @@ static SDValue lowerShuffleAsElementInsertion( V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); - V2 = DAG.getNode( - X86ISD::VSHLDQ, DL, MVT::v16i8, V2, - DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); + V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2, + DAG.getTargetConstant( + V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } @@ -12294,7 +12586,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -12486,7 +12778,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Try to lower a shuffle as a permute of the inputs followed by an @@ -12635,14 +12927,14 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. 
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); @@ -12688,7 +12980,7 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, - DAG.getConstant(SHUFPDMask, DL, MVT::i8)); + DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. @@ -12996,10 +13288,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; + // Try to use broadcast unless the mask only has one non-undef element. + if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + } // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. @@ -13680,16 +13974,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { - // Check for being able to broadcast a single element. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, - Mask, Subtarget, DAG)) - return Broadcast; - // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; @@ -13984,8 +14278,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. + bool EvenInUse = false, OddInUse = false; + for (int i = 0; i < 16; i += 2) { + EvenInUse |= (Mask[i + 0] >= 0); + OddInUse |= (Mask[i + 1] >= 0); + if (EvenInUse && OddInUse) + break; + } V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, - MVT::v16i8, V1, V1); + MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8), + OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8)); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) @@ -14100,11 +14402,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // First we need to zero all the dropped bytes. 
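The comment above introduces the constant used to zero the dropped bytes before packing: after dropping the odd elements NumEvenDrops times, only every (1 << NumEvenDrops)-th byte survives. A standalone sketch of the constant assembled in the next hunk, built here as raw bytes rather than build_vector operands:

    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> byteClearConstant(unsigned NumEvenDrops) {
      std::vector<uint8_t> Bytes(16, 0x00);          // bytes to be ANDed away
      for (unsigned i = 0; i < 16; i += 1u << NumEvenDrops)
        Bytes[i] = 0xFF;                             // bytes that survive the drops
      return Bytes;
    }

    int main() {
      auto B = byteClearConstant(2);   // keep bytes 0, 4, 8, 12
      return (B[0] == 0xFF && B[1] == 0x00 && B[4] == 0xFF) ? 0 : 1;
    }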
assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); - // We use the mask type to pick which bytes are preserved based on how many - // elements are dropped. - MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; - SDValue ByteClearMask = DAG.getBitcast( - MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); + SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); + for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) + ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); + SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); @@ -14448,16 +14749,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } -/// Lower a vector shuffle crossing multiple 128-bit lanes as -/// a permutation and blend of those lanes. +/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one +/// source with a lane permutation. /// -/// This essentially blends the out-of-lane inputs to each lane into the lane -/// from a permuted copy of the vector. This lowering strategy results in four -/// instructions in the worst case for a single-input cross lane shuffle which -/// is lower than any other fully general cross-lane shuffle strategy I'm aware -/// of. Special cases for each particular shuffle pattern should be handled -/// prior to trying this lowering. -static SDValue lowerShuffleAsLanePermuteAndBlend( +/// This lowering strategy results in four instructions in the worst case for a +/// single-input cross lane shuffle which is lower than any other fully general +/// cross-lane shuffle strategy I'm aware of. Special cases for each particular +/// shuffle pattern should be handled prior to trying this lowering. +static SDValue lowerShuffleAsLanePermuteAndShuffle( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. @@ -14484,24 +14783,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend( return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } + // TODO - we could support shuffling V2 in the Flipped input. assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); - SmallVector<int, 32> FlippedBlendMask(Size); - for (int i = 0; i < Size; ++i) - FlippedBlendMask[i] = - Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) - ? Mask[i] - : Mask[i] % LaneSize + - (i / LaneSize) * LaneSize + Size); + SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end()); + for (int i = 0; i < Size; ++i) { + int &M = InLaneMask[i]; + if (M < 0) + continue; + if (((M % Size) / LaneSize) != (i / LaneSize)) + M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; + } + assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && + "In-lane shuffle mask expected"); - // Flip the vector, and blend the results which should now be in-lane. + // Flip the lanes, and shuffle the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? 
MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); - Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), - { 2, 3, 0, 1 }); + Flipped = + DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); Flipped = DAG.getBitcast(VT, Flipped); - return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); } /// Handle lowering 2-lane 128-bit shuffles. @@ -14565,8 +14868,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } } } @@ -14598,7 +14901,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then @@ -14616,26 +14919,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); - int Size = Mask.size(); + int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; - int LaneSize = 128 / VT.getScalarSizeInBits(); - SmallVector<int, 16> RepeatMask(LaneSize, -1); + int NumLaneElts = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 16> RepeatMask(NumLaneElts, -1); SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}}); // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { - int Srcs[2] = { -1, -1 }; - SmallVector<int, 16> InLaneMask(LaneSize, -1); - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + int Srcs[2] = {-1, -1}; + SmallVector<int, 16> InLaneMask(NumLaneElts, -1); + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // Determine which of the possible input lanes (NumLanes from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. - int LaneSrc = M / LaneSize; + int LaneSrc = M / NumLaneElts; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; @@ -14645,7 +14948,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); Srcs[Src] = LaneSrc; - InLaneMask[i] = (M % LaneSize) + Src * Size; + InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; } // If this lane has two sources, see if it fits with the repeat mask so far. @@ -14701,23 +15004,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( if (LaneSrcs[Lane][0] >= 0) continue; - for (int i = 0; i != LaneSize; ++i) { - int M = Mask[(Lane * LaneSize) + i]; + for (int i = 0; i != NumLaneElts; ++i) { + int M = Mask[(Lane * NumLaneElts) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. 
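lowerShuffleAsLanePermuteAndShuffle above rewrites every lane-crossing index to pull from the lane-swapped copy of the input (the second shuffle operand), so the final shuffle stays in-lane. A small model of just the mask arithmetic, with invented names and the vector values elided:

    #include <vector>

    std::vector<int> makeInLaneMask(const std::vector<int> &Mask, int LaneSize) {
      int Size = (int)Mask.size();
      std::vector<int> InLane(Mask);
      for (int i = 0; i != Size; ++i) {
        int &M = InLane[i];
        if (M < 0)
          continue;
        if (((M % Size) / LaneSize) != (i / LaneSize))            // crosses a lane?
          M = (M % LaneSize) + (i / LaneSize) * LaneSize + Size;  // use the flipped copy
      }
      return InLane;
    }

    int main() {
      // v8f32-style: 8 elements, 4 per lane. Element 0 wants lane-1 data (index 6).
      auto M = makeInLaneMask({6, 1, 2, 3, -1, 5, 0, 7}, /*LaneSize=*/4);
      return (M[0] == 2 + 8 && M[6] == 4 + 8) ? 0 : 1;
    }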
if (RepeatMask[i] < 0) - RepeatMask[i] = M % LaneSize; + RepeatMask[i] = M % NumLaneElts; - if (RepeatMask[i] < Size) { - if (RepeatMask[i] != M % LaneSize) + if (RepeatMask[i] < NumElts) { + if (RepeatMask[i] != M % NumLaneElts) return SDValue(); - LaneSrcs[Lane][0] = M / LaneSize; + LaneSrcs[Lane][0] = M / NumLaneElts; } else { - if (RepeatMask[i] != ((M % LaneSize) + Size)) + if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) return SDValue(); - LaneSrcs[Lane][1] = M / LaneSize; + LaneSrcs[Lane][1] = M / NumLaneElts; } } @@ -14725,14 +15028,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( return SDValue(); } - SmallVector<int, 16> NewMask(Size, -1); + SmallVector<int, 16> NewMask(NumElts, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14745,11 +15048,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; - for (int i = 0; i != LaneSize; ++i) { + for (int i = 0; i != NumLaneElts; ++i) { int M = -1; if (Src >= 0) - M = Src * LaneSize + i; - NewMask[Lane * LaneSize + i] = M; + M = Src * NumLaneElts + i; + NewMask[Lane * NumLaneElts + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); @@ -14760,12 +15063,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask) return SDValue(); - for (int i = 0; i != Size; ++i) { - NewMask[i] = RepeatMask[i % LaneSize]; + for (int i = 0; i != NumElts; ++i) { + NewMask[i] = RepeatMask[i % NumLaneElts]; if (NewMask[i] < 0) continue; - NewMask[i] += (i / LaneSize) * LaneSize; + NewMask[i] += (i / NumLaneElts) * NumLaneElts; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } @@ -14831,14 +15134,13 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask, static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef<int> HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, - SelectionDAG &DAG) { + SelectionDAG &DAG, bool UseConcat = false) { assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); assert(V1.getValueType().isSimple() && "Expecting only simple types"); MVT VT = V1.getSimpleValueType(); - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); auto getHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) @@ -14853,6 +15155,14 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, SDValue Half1 = getHalfVector(HalfIdx1); SDValue Half2 = getHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + if (UseConcat) { + SDValue Op0 = V; + SDValue Op1 = DAG.getUNDEF(HalfVT); + if (UndefLower) + std::swap(Op0, Op1); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1); + } + unsigned Offset = UndefLower ? 
HalfNumElts : 0; return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); @@ -14877,9 +15187,8 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - unsigned NumElts = VT.getVectorNumElements(); - unsigned HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + unsigned HalfNumElts = HalfVT.getVectorNumElements(); if (!UndefLower && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, @@ -15155,11 +15464,19 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( } static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, - unsigned &ShuffleImm, ArrayRef<int> Mask) { + bool &ForceV1Zero, bool &ForceV2Zero, + unsigned &ShuffleImm, ArrayRef<int> Mask, + const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); + assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && + "Illegal shuffle mask"); + + bool ZeroLane[2] = { true, true }; + for (int i = 0; i < NumElts; ++i) + ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. @@ -15167,7 +15484,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { - if (Mask[i] == SM_SentinelUndef) + if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1]) continue; if (Mask[i] < 0) return false; @@ -15180,30 +15497,77 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, ShuffleImm |= (Mask[i] % 2) << i; } - if (ShufpdMask) - return true; - if (CommutableMask) { + if (!ShufpdMask && !CommutableMask) + return false; + + if (!ShufpdMask && CommutableMask) std::swap(V1, V2); - return true; - } - return false; + ForceV1Zero = ZeroLane[0]; + ForceV2Zero = ZeroLane[1]; + return true; } -static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& +static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; - if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) + bool ForceV1Zero = false, ForceV2Zero = false; + if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, + Mask, Zeroable)) return SDValue(); + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, - DAG.getConstant(Immediate, DL, MVT::i8)); + DAG.getTargetConstant(Immediate, DL, MVT::i8)); } +// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed +// by zeroable elements in the remaining 24 elements. Turn this into two +// vmovqb instructions shuffled together. 
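The reworked SHUFPD matching above folds Zeroable in per operand parity: if all even (or odd) result positions are zeroable, V1 (or V2) can simply be forced to a zero vector, and the remaining positions each contribute one bit to the immediate. A simplified, non-commuting sketch; the allowed index pair per position is reconstructed from the mask comment above, so treat this as an approximation rather than the exact LLVM predicate:

    #include <vector>

    bool matchShufpd(const std::vector<int> &Mask, const std::vector<bool> &Zeroable,
                     unsigned &Imm, bool &ForceV1Zero, bool &ForceV2Zero) {
      int NumElts = (int)Mask.size();
      bool ZeroLane[2] = {true, true};
      for (int i = 0; i != NumElts; ++i)
        ZeroLane[i & 1] = ZeroLane[i & 1] && Zeroable[i];   // even -> V1, odd -> V2

      Imm = 0;
      for (int i = 0; i != NumElts; ++i) {
        if (Mask[i] == -1 || ZeroLane[i & 1])
          continue;                             // undef, or covered by a zeroed operand
        if (Mask[i] < 0)
          return false;                         // a zero the operands can't supply
        int Base = (i & ~1) + (i & 1) * NumElts;
        if (Mask[i] != Base && Mask[i] != Base + 1)
          return false;
        Imm |= (unsigned)(Mask[i] % 2) << i;    // which half of the pair was chosen
      }
      ForceV1Zero = ZeroLane[0];
      ForceV2Zero = ZeroLane[1];
      return true;
    }

    int main() {
      // v4f64 mask {1, 5, 2, 7} -> immediate 0b1011, nothing forced to zero.
      unsigned Imm; bool F1, F2;
      std::vector<int> Mask = {1, 5, 2, 7};
      std::vector<bool> Z(4, false);
      return (matchShufpd(Mask, Z, Imm, F1, F2) && Imm == 0b1011) ? 0 : 1;
    }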
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + assert(VT == MVT::v32i8 && "Unexpected type!"); + + // The first 8 indices should be every 8th element. + if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) + return SDValue(); + + // Remaining elements need to be zeroable. + if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) + return SDValue(); + + V1 = DAG.getBitcast(MVT::v4i64, V1); + V2 = DAG.getBitcast(MVT::v4i64, V2); + + V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); + V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); + + // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in + // the upper bits of the result using an unpckldq. + SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, + { 0, 1, 2, 3, 16, 17, 18, 19, + 4, 5, 6, 7, 20, 21, 22, 23 }); + // Insert the unpckldq into a zero vector to widen to v32i8. + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, + DAG.getConstant(0, DL, MVT::v32i8), Unpack, + DAG.getIntPtrConstant(0, DL)); +} + + /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -15236,7 +15600,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. @@ -15256,8 +15620,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; // Otherwise, fall back. - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask, + DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. @@ -15269,7 +15633,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Blend; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; // If we have one input in place, then we can permute the other input and @@ -15473,8 +15838,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. 
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -15681,8 +16046,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, - DAG, Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask, + DAG, Subtarget); } SmallVector<int, 8> RepeatedMask; @@ -15780,8 +16145,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; - return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, - Subtarget); + return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, + DAG, Subtarget); } if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, @@ -15803,6 +16168,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; + // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed + // by zeroable elements in the remaining 24 elements. Turn this into two + // vmovqb instructions shuffled together. + if (Subtarget.hasVLX()) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, + Mask, Zeroable, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); @@ -15974,7 +16347,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); } /// Handle lowering of 8-lane 64-bit floating point shuffles. @@ -15999,7 +16372,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, - DAG.getConstant(VPERMILPMask, DL, MVT::i8)); + DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector<int, 4> RepeatedMask; @@ -16016,7 +16389,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. - if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) + if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask, + Zeroable, Subtarget, DAG)) return Op; if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, @@ -16389,6 +16763,49 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, } } +static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + // Shuffle should be unary. + if (!V2.isUndef()) + return SDValue(); + + int ShiftAmt = -1; + int NumElts = Mask.size(); + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && + "Unexpected mask index."); + if (M < 0) + continue; + + // The first non-undef element determines our shift amount. + if (ShiftAmt < 0) { + ShiftAmt = M - i; + // Need to be shifting right. 
+ if (ShiftAmt <= 0) + return SDValue(); + } + // All non-undef elements must shift by the same amount. + if (ShiftAmt != M - i) + return SDValue(); + } + assert(ShiftAmt >= 0 && "All undef?"); + + // Great we found a shift right. + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V1, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); +} + // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchShuffleAsShift. @@ -16434,13 +16851,20 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); - unsigned NumElts = Mask.size(); + int NumElts = Mask.size(); // Try to recognize shuffles that are just padding a subvector with zeros. - unsigned SubvecElts = 0; - for (int i = 0; i != (int)NumElts; ++i) { - if (Mask[i] >= 0 && Mask[i] != i) - break; + int SubvecElts = 0; + int Src = -1; + for (int i = 0; i != NumElts; ++i) { + if (Mask[i] >= 0) { + // Grab the source from the first valid mask. All subsequent elements need + // to use this same source. + if (Src < 0) + Src = Mask[i] / NumElts; + if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) + break; + } ++SubvecElts; } @@ -16451,30 +16875,54 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Make sure the number of zeroable bits in the top at least covers the bits // not covered by the subvector. - if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) { + assert(Src >= 0 && "Expected a source!"); MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, - V1, DAG.getIntPtrConstant(0, DL)); + Src == 0 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), + DAG.getConstant(0, DL, VT), Extract, DAG.getIntPtrConstant(0, DL)); } + // Try a simple shift right with undef elements. Later we'll try with zeros. + if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, + DAG)) + return Shift; + // Try to match KSHIFTs. - // TODO: Support narrower than legal shifts by widening and extracting. - if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) { - unsigned Offset = 0; - for (SDValue V : { V1, V2 }) { - unsigned Opcode; - int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); - if (ShiftAmt >= 0) - return DAG.getNode(Opcode, DL, VT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i8)); - Offset += NumElts; // Increment for next iteration. + unsigned Offset = 0; + for (SDValue V : { V1, V2 }) { + unsigned Opcode; + int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); + if (ShiftAmt >= 0) { + MVT WideVT = VT; + if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) + WideVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, + DAG.getUNDEF(WideVT), V, + DAG.getIntPtrConstant(0, DL)); + // Widened right shifts need two shifts to ensure we shift in zeroes. + if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { + int WideElts = WideVT.getVectorNumElements(); + // Shift left to put the original vector in the MSBs of the new size. + Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, + DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); + // Increase the shift amount to account for the left shift. + ShiftAmt += WideElts - NumElts; + } + + Res = DAG.getNode(Opcode, DL, WideVT, Res, + DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); } + Offset += NumElts; // Increment for next iteration. } + MVT ExtVT; switch (VT.SimpleTy) { default: @@ -16594,7 +17042,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); + ArrayRef<int> OrigMask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); @@ -16620,8 +17068,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef && - any_of(Mask, [NumElements](int M) { return M >= NumElements; })) { - SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); + any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { + SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; @@ -16629,15 +17077,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Check for illegal shuffle mask element index values. - int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; - assert(llvm::all_of(Mask, + int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); + (void)MaskUpperLimit; + assert(llvm::all_of(OrigMask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. - APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); @@ -16645,11 +17094,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. - SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end()); + SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0; i != NumElements; ++i) - if (Mask[i] != SM_SentinelUndef && Zeroable[i]) + if (OrigMask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } @@ -16664,7 +17113,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, // by obfuscating the operands with bitcasts. 
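Editor's aside (not part of the patch): the widened KSHIFTR sequence in the lower1BitShuffle hunk above needs the extra KSHIFTL because a v8i1/v4i1/v2i1 mask is first inserted into a wider legal k-register whose upper lanes are undefined; shifting left into the most significant bits first guarantees that zeroes, not garbage, get shifted down into the live lanes. A plain-integer model of the sequence, with a hypothetical helper name:

#include <cstdint>

// Emulate a right shift of an 8-lane mask that is only available as a 16-bit
// k-register whose upper 8 bits are garbage.
uint8_t kshiftr_v8_via_v16(uint16_t WideMask /* low 8 bits valid */,
                           unsigned ShiftAmt) {
  const unsigned WideElts = 16, NumElts = 8;
  uint16_t Res = (uint16_t)(WideMask << (WideElts - NumElts)); // KSHIFTL by 8
  Res = (uint16_t)(Res >> (ShiftAmt + (WideElts - NumElts)));  // KSHIFTR by 8+amt
  return (uint8_t)Res;  // extract the low v8i1 result
}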
// TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. - if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG)) return Broadcast; @@ -16700,8 +17149,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, } // Commute the shuffle if it will improve canonicalization. - if (canonicalizeShuffleMaskWithCommute(Mask)) - return DAG.getCommutedVectorShuffle(*SVOp); + SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end()); + if (canonicalizeShuffleMaskWithCommute(Mask)) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; @@ -16910,7 +17362,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17137,8 +17589,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); - N2 = DAG.getIntPtrConstant(1, dl); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, + DAG.getTargetConstant(1, dl, MVT::i8)); } } @@ -17207,14 +17659,14 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. - N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, + DAG.getTargetConstant(1, dl, MVT::i8)); } - N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, + DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8)); } // PINSR* works with constant index. @@ -17300,7 +17752,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, - DAG.getConstant(IdxVal, dl, MVT::i8)); + DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); @@ -17841,10 +18293,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, std::swap(Op0, Op1); APInt APIntShiftAmt; - if (isConstantSplat(Amt, APIntShiftAmt)) { + if (X86::isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits()); - return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, - Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); + return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, + Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? 
X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, @@ -17970,6 +18422,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (VT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; @@ -18072,6 +18527,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, return Result; } +/// Horizontal vector math instructions may be slower than normal math with +/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch +/// implementation, and likely shuffle complexity of the alternate sequence. +static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool HasFastHOps = Subtarget.hasFastHorizontalOps(); + return !IsSingleSource || IsOptimizingSize || HasFastHOps; +} + /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -18126,8 +18591,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; - if (Subtarget.hasSSE3()) { - // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'. + if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) { Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); @@ -18273,7 +18737,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, - VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); @@ -18281,7 +18745,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // High will be bitcasted right away, so do not bother bitcasting back to // its original type. 
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, - VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); + VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; @@ -18329,16 +18793,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); + MVT SrcVT = N0.getSimpleValueType(); + MVT DstVT = Op.getSimpleValueType(); - if (Op.getSimpleValueType().isVector()) + if (DstVT == MVT::f128) + return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); + + if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; - MVT SrcVT = N0.getSimpleValueType(); - MVT DstVT = Op.getSimpleValueType(); - if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, @@ -18346,6 +18812,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return Op; } + // Promote i32 to i64 and use a signed conversion on 64-bit targets. + if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { + N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0); + return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0); + } + if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; @@ -18579,7 +19051,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -18602,10 +19074,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); // Short-circuit if we can determine that each 128-bit half is the same value. @@ -18903,9 +19372,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // If called by the legalizer just return. - if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) + // If we're called by the type legalizer, handle a few cases. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(InVT)) { + if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && + VT.is128BitVector()) { + assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + // The default behavior is to truncate one step, concatenate, and then + // truncate the remainder. We'd rather produce two 64-bit results and + // concatenate those. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, DL); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo); + Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); + } + + // Otherwise let default legalization handle it. 
return SDValue(); + } if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); @@ -18940,6 +19429,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; + // Handle truncation of V256 to V128 using shuffles. + assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); + if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { @@ -19016,22 +19508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } - // Handle truncation of V256 to V128 using shuffles. - assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); - - assert(Subtarget.hasAVX() && "256-bit vector without AVX!"); - - unsigned NumElems = VT.getVectorNumElements(); - MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); - - SmallVector<int, 16> MaskVec(NumElems * 2, -1); - // Prepare truncation shuffle mask - for (unsigned i = 0; i != NumElems; ++i) - MaskVec[i] = i * 2; - In = DAG.getBitcast(NVT, In); - SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, - DAG.getIntPtrConstant(0, DL)); + llvm_unreachable("All 256->128 cases should have been handled above!"); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { @@ -19041,6 +19518,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Src.getSimpleValueType(); SDLoc dl(Op); + if (SrcVT == MVT::f128) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(SrcVT, VT); + else + LC = RTLIB::getFPTOUINT(SrcVT, VT); + + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; + } + if (VT.isVector()) { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; @@ -19075,14 +19563,27 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); - if (!IsSigned && Subtarget.hasAVX512()) { - // Conversions from f32/f64 should be legal. - if (UseSSEReg) + if (!IsSigned && UseSSEReg) { + // Conversions from f32/f64 with AVX512 should be legal. + if (Subtarget.hasAVX512()) return Op; - // Use default expansion. + // Use default expansion for i64. if (VT == MVT::i64) return SDValue(); + + assert(VT == MVT::i32 && "Unexpected VT!"); + + // Promote i32 to i64 and use a signed operation on 64-bit targets. + if (Subtarget.is64Bit()) { + SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + } + + // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can + // use fisttp which will be handled later. + if (!Subtarget.hasSSE3()) + return SDValue(); } // Promote i16 to i32 if we can use a SSE operation. 
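Editor's aside (not part of the patch): the two i32 promotions added above (UINT_TO_FP in LowerUINT_TO_FP and FP_TO_UINT here) both exploit the fact that a signed 64-bit integer can represent every uint32_t value, so on 64-bit targets the unsigned conversions reduce to the cheaper signed i64 forms. Scalar C++ equivalents, assuming in-range inputs for the fp-to-int direction:

#include <cstdint>

// uint32 -> f64: zero-extend to i64, then signed int-to-fp.
double u32_to_f64(uint32_t X) {
  return (double)(int64_t)(uint64_t)X;   // the extended value is never negative
}

// f64 -> uint32 (in-range inputs): signed fp-to-int at 64 bits, then truncate.
uint32_t f64_to_u32(double D) {
  return (uint32_t)(int64_t)D;           // matches FP_TO_SINT i64 + TRUNCATE
}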
@@ -19103,12 +19604,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } -static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); + if (VT == MVT::f128) { + RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); + return LowerF128Call(Op, DAG, LC); + } + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, @@ -19116,14 +19622,31 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -/// Horizontal vector math instructions may be slower than normal math with -/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch -/// implementation, and likely shuffle complexity of the alternate sequence. -static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize(); - bool HasFastHOps = Subtarget.hasFastHorizontalOps(); - return !IsSingleSource || IsOptimizingSize || HasFastHOps; +SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT SVT = In.getSimpleValueType(); + + // It's legal except when f128 is involved + if (SVT != MVT::f128) + return Op; + + RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; +} + +// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking +// the default expansion of STRICT_FP_ROUND. +static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { + // FIXME: Need to form a libcall with an input chain for f128. + assert(Op.getOperand(0).getValueType() != MVT::f128 && + "Don't know how to handle f128 yet!"); + return Op; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -19200,8 +19723,13 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. -static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::f128) { + RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? 
RTLIB::ADD_F128 + : RTLIB::SUB_F128; + return LowerF128Call(Op, DAG, LC); + } + assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); @@ -19358,13 +19886,13 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); + DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) /// style scalarized (associative) reduction patterns. -static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp, - SmallVectorImpl<SDValue> &SrcOps) { +static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, + SmallVectorImpl<SDValue> &SrcOps) { SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; @@ -19437,7 +19965,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, return SDValue(); SmallVector<SDValue, 8> VecIns; - if (!matchBitOpReduction(Op, ISD::OR, VecIns)) + if (!matchScalarReduction(Op, ISD::OR, VecIns)) return SDValue(); // Quit if not 128/256-bit vector. @@ -19461,8 +19989,8 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, - MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, + DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } @@ -19576,6 +20104,13 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); + case ISD::SSUBO: + case ISD::USUBO: { + // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), + Op->getOperand(1)).getValue(1); + } default: default_case: break; @@ -19766,6 +20301,63 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } +SDValue +X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const { + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + + assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) && + "Unexpected divisor!"); + + // Only perform this transform if CMOV is supported otherwise the select + // below will become a branch. + if (!Subtarget.hasCMov()) + return SDValue(); + + // fold (sdiv X, pow2) + EVT VT = N->getValueType(0); + // FIXME: Support i8. + if (VT != MVT::i16 && VT != MVT::i32 && + !(Subtarget.is64Bit() && VT == MVT::i64)) + return SDValue(); + + unsigned Lg2 = Divisor.countTrailingZeros(); + + // If the divisor is 2 or -2, the default expansion is better. + if (Lg2 == 1) + return SDValue(); + + SDLoc DL(N); + SDValue N0 = N->getOperand(0); + SDValue Zero = DAG.getConstant(0, DL, VT); + APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2); + SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT); + + // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right. 
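// [Editor's aside, not part of the patch] Scalar equivalent of the CMOV-based
// expansion that follows, for X / (1 << Lg2) with signed X and sdiv's
// round-toward-zero semantics:
//   bias   = (X < 0) ? (1 << Lg2) - 1 : 0;   // the SETCC / ADD / SELECT below
//   result = (X + bias) >> Lg2;              // arithmetic shift (SRA)
// Worked example with Lg2 = 3: -13 / 8 becomes (-13 + 7) >> 3 = -6 >> 3 = -1,
// whereas a bare -13 >> 3 rounds toward negative infinity and gives -2.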
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); + SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); + + Created.push_back(Cmp.getNode()); + Created.push_back(Add.getNode()); + Created.push_back(CMov.getNode()); + + // Divide by pow2. + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64)); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (Divisor.isNonNegative()) + return SRA; + + Created.push_back(SRA.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA); +} + /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, @@ -19842,8 +20434,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, if (Src.getValueType() != BitNo.getValueType()) BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo); - X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, - dl, MVT::i8); + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, + dl, MVT::i8); return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo); } @@ -19935,13 +20527,6 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - // If this is a seteq make sure any build vectors of all zeros are on the RHS. - // This helps with vptestm matching. - // TODO: Should we just canonicalize the setcc during DAG combine? - if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && - ISD::isBuildVectorAllZeros(Op0.getNode())) - std::swap(Op0, Op1); - // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); @@ -20007,7 +20592,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) return SDValue(); - SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false); + SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false); if (!ULEOp1) return SDValue(); Op1 = ULEOp1; @@ -20018,7 +20603,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, // This is beneficial because materializing a constant 0 for the PCMPEQ is // probably cheaper than XOR+PCMPGT using 2 different vector constants: // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 - SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true); + SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true); if (!UGEOp1) return SDValue(); Op1 = Op0; @@ -20086,14 +20671,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC0, dl, MVT::i8)); + DAG.getTargetConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CC1, dl, MVT::i8)); + DAG.getTargetConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. 
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(SSECC, dl, MVT::i8)); + DAG.getTargetConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the @@ -20106,16 +20691,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, } MVT VTOp0 = Op0.getSimpleValueType(); + (void)VTOp0; assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); - // This is being called by type legalization because v2i32 is marked custom - // for result type legalization for v2f32. - if (VTOp0 == MVT::v2i32) - return SDValue(); - // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && @@ -20153,7 +20734,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, - DAG.getConstant(CmpMode, dl, MVT::i8)); + DAG.getTargetConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. @@ -20222,21 +20803,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. - if (Cond == ISD::SETUGT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isMaxValue(); - })) { + if (Cond == ISD::SETUGT) { // X > C --> X >= (C+1) --> X == umax(X, C+1) - Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETUGE; + if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) { + Op1 = UGTOp1; + Cond = ISD::SETUGE; + } } - if (Cond == ISD::SETULT && - ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) { - return !C->getAPIntValue().isNullValue(); - })) { + if (Cond == ISD::SETULT) { // X < C --> X <= (C-1) --> X == umin(X, C-1) - Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT)); - Cond = ISD::SETULE; + if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) { + Op1 = ULTOp1; + Cond = ISD::SETULE; + } } bool Invert = false; unsigned Opc; @@ -20360,11 +20939,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } -// Try to select this as a KORTEST+SETCC if possible. -static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue &X86CC) { +// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. +static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); @@ -20389,6 +20968,21 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, } else return SDValue(); + // If the input is an AND, we can combine it's operands into the KTEST. 
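// [Editor's aside, not part of the patch] The new AND case below leans on
// KTEST setting ZF exactly when (LHS & RHS) is all zeroes, so an equality
// compare of (and a, b) against zero can be answered straight from the flags
// without materializing the AND result, mirroring how the existing OR case
// uses KORTEST's "OR is all zeroes" ZF.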
+ bool KTestable = false; + if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) + KTestable = true; + if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)) + KTestable = true; + if (!isNullConstant(Op1)) + KTestable = false; + if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) { + SDValue LHS = Op0.getOperand(0); + SDValue RHS = Op0.getOperand(1); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS); + } + // If the input is an OR, we can combine it's operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; @@ -20397,7 +20991,7 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, RHS = Op0.getOperand(1); } - X86CC = DAG.getConstant(X86Cond, dl, MVT::i8); + X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } @@ -20425,9 +21019,9 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, return PTEST; } - // Try to lower using KORTEST. - if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) - return KORTEST; + // Try to lower using KORTEST or KTEST. + if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) + return Test; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. @@ -20442,7 +21036,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - X86CC = DAG.getConstant(CCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); @@ -20456,7 +21050,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - X86CC = DAG.getConstant(CondCode, dl, MVT::i8); + X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } @@ -20472,6 +21066,19 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets handled by emitFlagsForSetcc. + if (Op0.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1); + + // If softenSetCCOperands returned a scalar, use it. 
+ if (!Op1.getNode()) { + assert(Op0.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return Op0; + } + } + SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); if (!EFLAGS) @@ -20612,15 +21219,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, - CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); + SDValue Cmp = + DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, + DAG.getTargetConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, - DAG.getConstant(SSECC, DL, MVT::i8)); + DAG.getTargetConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. @@ -20718,8 +21326,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - unsigned CondCode = - cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + unsigned CondCode = Cond.getConstantOperandVal(0); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { @@ -20807,8 +21414,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); - MVT VT = Op.getSimpleValueType(); - bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? @@ -20826,7 +21431,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - CC = DAG.getConstant(X86Cond, DL, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); AddTest = false; } @@ -20848,7 +21453,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (AddTest) { - CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), X86::COND_NE, DL, DAG); } @@ -20864,9 +21469,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { - SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), - DAG.getConstant(X86::COND_B, DL, MVT::i8), - Cond); + SDValue Res = + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; @@ -21037,8 +21642,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. 
if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); - int HalfNumElts = NumElts / 2; - MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); + MVT HalfVT = VT.getHalfNumVectorElementsVT(); + int HalfNumElts = HalfVT.getVectorNumElements(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef); @@ -21081,7 +21686,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, - DAG.getConstant(SignExtShift, dl, MVT::i8)); + DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); } if (VT == MVT::v2i64) { @@ -21119,7 +21724,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { - if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) + if (VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), @@ -21138,10 +21743,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT - - MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - + MVT HalfVT = VT.getHalfNumVectorElementsVT(); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); @@ -21165,7 +21767,7 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreVT = StoredVal.getSimpleValueType(); @@ -21201,7 +21803,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, // Splitting volatile memory ops is not allowed unless the operation was not // legal to begin with. We are assuming the input op is legal (this transform // is only used for targets with AVX). - if (Store->isVolatile()) + if (!Store->isSimple()) return SDValue(); MVT StoreSVT = StoreVT.getScalarType(); @@ -21266,14 +21868,13 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); - if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != - TargetLowering::TypeWidenVector) - return SDValue(); + assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == + TargetLowering::TypeWidenVector && "Unexpected type action!"); - MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), - StoreVT.getVectorNumElements() * 2); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); @@ -21313,11 +21914,10 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); SDLoc dl(Ld); - EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. 
if (RegVT.getVectorElementType() == MVT::i1) { - assert(EVT(RegVT) == MemVT && "Expected non-extending load"); + assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); @@ -21336,176 +21936,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } - // Nothing useful we can do without SSE2 shuffles. - assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); - - ISD::LoadExtType Ext = Ld->getExtensionType(); - - assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) - && "Only anyext and sext are currently implemented."); - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { - // The only way in which we have a legal 256-bit vector result but not the - // integer 256-bit operations needed to directly lower a sextload is if we - // have AVX1 but not AVX2. In that case, we can always emit a sextload to - // a 128-bit vector and a normal sign_extend to 256-bits that should get - // correctly legalized. We do this late to allow the canonical form of - // sextload to persist throughout the rest of the DAG combiner -- it wants - // to fold together any extensions it can, and so will fuse a sign_extend - // of an sextload into a sextload targeting a wider value. - SDValue Load; - if (MemSz == 128) { - // Just switch this to a normal load. - assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " - "it must be a legal 128-bit vector " - "type!"); - Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } else { - assert(MemSz < 128 && - "Can't extend a type wider than 128 bits to a 256 bit vector!"); - // Do an sext load to a 128-bit vector type. We want to use the same - // number of elements, but elements half as wide. This will end up being - // recursively lowered by this routine, but will succeed as we definitely - // have all the necessary features if we're using AVX1. - EVT HalfEltVT = - EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); - EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); - Load = - DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); - } - - // Replace chain users with the new chain. - assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); - - // Finally, do a normal sign-extend to the desired register. - SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); - return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); - } - - // All sizes must be a power of two. - assert(isPowerOf2_32(RegSz * MemSz * NumElems) && - "Non-power-of-two elements are not custom lowered!"); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. 
- MVT SclrLoadTy = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. - unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - - assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && - "Can only lower sext loads with a single scalar load!"); - - unsigned loadRegSize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz >= 256) - loadRegSize = 128; - - // If we don't have BWI we won't be able to create the shuffle needed for - // v8i8->v8i64. - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) - loadRegSize = 128; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT( - *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegSize / MemVT.getScalarSizeInBits()); - - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); - - // We can't shuffle using an illegal type. - assert(TLI.isTypeLegal(WideVecVT) && - "We only lower types that form legal widened vector types"); - - SmallVector<SDValue, 8> Chains; - SDValue Ptr = Ld->getBasePtr(); - unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; - SDValue Increment = DAG.getConstant(OffsetInc, dl, - TLI.getPointerTy(DAG.getDataLayout())); - SDValue Res = DAG.getUNDEF(LoadUnitVecVT); - - unsigned Offset = 0; - for (unsigned i = 0; i < NumLoads; ++i) { - unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); - - // Perform a single load. - SDValue ScalarLoad = - DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, - Ld->getPointerInfo().getWithOffset(Offset), - NewAlign, Ld->getMemOperand()->getFlags()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i, dl)); - - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - Offset += OffsetInc; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); - - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. - SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); - unsigned SizeRatio = RegSz / MemSz; - - if (Ext == ISD::SEXTLOAD) { - SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && - MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG); - return DAG.getMergeValues({Sext, TF}, dl); - } - - // Redistribute the loaded elements into the different locations. 
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - - // Bitcast to the requested type. - Shuff = DAG.getBitcast(RegVT, Shuff); - return DAG.getMergeValues({Shuff, TF}, dl); + return SDValue(); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes @@ -21610,7 +22041,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; @@ -21638,10 +22069,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + X86::CondCode CCode0 = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode0 = X86::GetOppositeBranchCondition(CCode0); + CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order @@ -21654,12 +22085,12 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (void)NewBR; Dest = FalseBB; - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, + Dest, CC, Cmp); + X86::CondCode CCode1 = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode1 = X86::GetOppositeBranchCondition(CCode1); + CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21672,7 +22103,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getConstant(CCode, dl, MVT::i8); + CC = DAG.getTargetConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && @@ -21698,10 +22129,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21714,10 +22145,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); + CC = 
DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); - CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); + CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } @@ -21742,7 +22173,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getConstant(X86Cond, dl, MVT::i8); + CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), X86Cond, dl, DAG); } @@ -21770,7 +22201,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Align = Op.getConstantOperandVal(2); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack @@ -21811,7 +22242,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); @@ -21821,7 +22252,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned SPReg = RegInfo->getStackRegister(); + Register SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -22076,7 +22507,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, } return DAG.getNode(Opc, dl, VT, SrcOp, - DAG.getConstant(ShiftAmt, dl, MVT::i8)); + DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a @@ -22121,7 +22552,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); else { - SDValue ByteShift = DAG.getConstant( + SDValue ByteShift = DAG.getTargetConstant( (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, @@ -22308,13 +22739,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; + return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; return false; }; auto isRoundModeSAE = [](SDValue Rnd) { - if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) - return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC; + if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { + unsigned RC = C->getZExtValue(); + if (RC & X86::STATIC_ROUNDING::NO_EXC) { + // Clear the NO_EXC bit and check remaining bits. + RC ^= X86::STATIC_ROUNDING::NO_EXC; + // As a convenience we allow no other bits or explicitly + // current direction. 
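// [Editor's aside, not part of the patch] With the usual STATIC_ROUNDING
// encoding (CUR_DIRECTION = 4, NO_EXC = 8; stated here as an assumption), the
// check below accepts an immediate of 8 ("SAE only") or 12 ("SAE with the
// current rounding direction") and rejects any value that also asks for an
// explicit embedded rounding mode.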
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION; + } + } return false; }; @@ -22335,7 +22774,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, }; SDLoc dl(Op); - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned IntNo = Op.getConstantOperandVal(0); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { @@ -22411,9 +22850,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); - if (IntrData->Type == INTR_TYPE_3OP_IMM8) - Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); - // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22666,7 +23102,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue CC = Op.getOperand(3); - CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -22685,7 +23120,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); - SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue CC = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue Cmp; @@ -22750,16 +23185,16 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned CondVal = Op.getConstantOperandVal(3); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8)); + DAG.getTargetConstant(CondVal, dl, MVT::i8)); else if (isRoundModeSAE(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, - DAG.getConstant(CondVal, dl, MVT::i8), Sae); + DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae); else return SDValue(); // Need to fill with zeros to ensure the bitcast will produce zeroes @@ -22819,9 +23254,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. - SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(2), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast<ConstantSDNode>(Op.getOperand(2)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } @@ -22829,12 +23264,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. 
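// A standalone sketch (not part of the patch) of the SAE check above, using the
// X86::STATIC_ROUNDING values as plain constants (CUR_DIRECTION = 4, NO_EXC = 8):
// a rounding operand counts as SAE if NO_EXC is set and nothing else remains
// except, optionally, CUR_DIRECTION.
namespace sae_sketch {
constexpr unsigned kCurDirection = 4, kNoExc = 8;
inline bool isSAE(unsigned RC) {
  if (RC & kNoExc) {
    RC ^= kNoExc;                          // clear the NO_EXC bit
    return RC == 0 || RC == kCurDirection; // allow nothing else, or CUR_DIRECTION
  }
  return false;
}
// isSAE(8) and isSAE(12) are true; isSAE(9) (NO_EXC | TO_NEG_INF) is false.
} // namespace sae_sketch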
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, - Op.getOperand(3), - DAG.getConstant(0xf, dl, MVT::i32)); + auto Round = cast<ConstantSDNode>(Op.getOperand(3)); + SDValue RoundingMode = + DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } + case BEXTRI: { + assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode"); + + // The control is a TargetConstant, but we need to convert it to a + // ConstantSDNode. + uint64_t Imm = Op.getConstantOperandVal(2); + SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType()); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), Control); + } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); @@ -23165,6 +23610,61 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } + case Intrinsic::x86_mmx_pslli_w: + case Intrinsic::x86_mmx_pslli_d: + case Intrinsic::x86_mmx_pslli_q: + case Intrinsic::x86_mmx_psrli_w: + case Intrinsic::x86_mmx_psrli_d: + case Intrinsic::x86_mmx_psrli_q: + case Intrinsic::x86_mmx_psrai_w: + case Intrinsic::x86_mmx_psrai_d: { + SDLoc DL(Op); + SDValue ShAmt = Op.getOperand(2); + // If the argument is a constant, convert it to a target constant. + if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) { + ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), ShAmt); + } + + unsigned NewIntrinsic; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + case Intrinsic::x86_mmx_pslli_w: + NewIntrinsic = Intrinsic::x86_mmx_psll_w; + break; + case Intrinsic::x86_mmx_pslli_d: + NewIntrinsic = Intrinsic::x86_mmx_psll_d; + break; + case Intrinsic::x86_mmx_pslli_q: + NewIntrinsic = Intrinsic::x86_mmx_psll_q; + break; + case Intrinsic::x86_mmx_psrli_w: + NewIntrinsic = Intrinsic::x86_mmx_psrl_w; + break; + case Intrinsic::x86_mmx_psrli_d: + NewIntrinsic = Intrinsic::x86_mmx_psrl_d; + break; + case Intrinsic::x86_mmx_psrli_q: + NewIntrinsic = Intrinsic::x86_mmx_psrl_q; + break; + case Intrinsic::x86_mmx_psrai_w: + NewIntrinsic = Intrinsic::x86_mmx_psra_w; + break; + case Intrinsic::x86_mmx_psrai_d: + NewIntrinsic = Intrinsic::x86_mmx_psra_d; + break; + } + + // The vector shift intrinsics with scalars uses 32b shift amounts but + // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an + // MMX register. + ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(NewIntrinsic, DL, MVT::i32), + Op.getOperand(1), ShAmt); + + } } } @@ -23177,7 +23677,9 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. 
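// Standalone sketch of what the MMX shift-by-scalar handling above corresponds to
// at the user-intrinsic level (illustrative only, not the backend code): with a
// constant count the immediate form is kept as a target constant; otherwise the
// i32 count is copied into an MMX register and the register form of the shift is
// used, because the psll/psrl/psra register forms read a 64-bit count.
#include <mmintrin.h>
static __m64 shiftLeftWords(__m64 V, int Count, bool CountIsConstant) {
  if (CountIsConstant)
    return _mm_slli_pi16(V, Count);                // pslli.w with an immediate count
  return _mm_sll_pi16(V, _mm_cvtsi32_si64(Count)); // psll.w with the count in an MMX reg
}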
if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); // If source is undef or we know it won't be used, use a zero vector @@ -23204,7 +23706,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23238,7 +23742,9 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); @@ -23266,7 +23772,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, // Scale must be constant. if (!C) return SDValue(); - SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, + TLI.getPointerTy(DAG.getDataLayout())); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = @@ -23435,8 +23943,7 @@ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - + unsigned IntNo = Op.getConstantOperandVal(1); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { @@ -23538,10 +24045,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. - SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), - DAG.getConstant(1, dl, Op->getValueType(1)), - DAG.getConstant(X86::COND_B, dl, MVT::i8), - SDValue(Result.getNode(), 1) }; + SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), + DAG.getConstant(1, dl, Op->getValueType(1)), + DAG.getTargetConstant(X86::COND_B, dl, MVT::i8), + SDValue(Result.getNode(), 1)}; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. 
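// Standalone illustration (not LLVM code) of the contract being lowered above:
// RDRAND/RDSEED report success in CF, and the step intrinsic surfaces that flag
// as its int return value while the data is written through the pointer.
// Assumes a compiler/CPU with RDRAND support.
#include <immintrin.h>
#include <cstdint>
static bool tryGetRandom64(std::uint64_t &Out) {
  unsigned long long V = 0;
  int Valid = _rdrand64_step(&V); // 1 iff CF was set, i.e. V holds a valid value
  Out = V;
  return Valid == 1;
}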
@@ -23581,8 +24088,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, Scale, Chain, Subtarget); } case PREFETCH: { - SDValue Hint = Op.getOperand(6); - unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue(); + const APInt &HintVal = Op.getConstantOperandAPInt(6); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); @@ -23678,7 +24184,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -23730,7 +24236,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -23743,12 +24249,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. -unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { +Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const MachineFunction &MF = DAG.getMachineFunction(); - unsigned Reg = StringSwitch<unsigned>(RegName) + Register Reg = StringSwitch<unsigned>(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) @@ -23762,8 +24267,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } @@ -23809,7 +24313,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); @@ -23967,6 +24471,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: + case CallingConv::Tail: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; @@ -24279,12 +24784,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. 
- SDValue Ops[] = { - Op, - DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } @@ -24312,12 +24814,9 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. - SDValue Ops[] = { - Op, - DAG.getConstant(NumBits, dl, VT), - DAG.getConstant(X86::COND_E, dl, MVT::i8), - Op.getValue(1) - }; + SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), + DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), + Op.getValue(1)}; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } @@ -24453,7 +24952,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); - SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } @@ -25033,7 +25532,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // Optimize shl/srl/sra with constant shift amount. APInt APIntShiftAmt; - if (!isConstantSplat(Amt, APIntShiftAmt)) + if (!X86::isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); // If the shift amount is out of range, return undef. @@ -25220,7 +25719,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -25502,7 +26001,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, (VT == MVT::v32i8 && Subtarget.hasInt256())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); - SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8); + SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); // Extend constant shift amount to vXi16 (it doesn't matter if the type // isn't legal). @@ -25774,7 +26273,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -25795,7 +26294,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (0 <= CstSplatIndex) { uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -26032,7 +26531,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // If this is a canonical idempotent atomicrmw w/no uses, we have a better // lowering available in lowerAtomicArith. - // TODO: push more cases through this path. + // TODO: push more cases through this path. 
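// Standalone scalar model (not the DAG code) of the cmov fallbacks built above:
// bsf/bsr set ZF and leave the destination undefined when the input is zero, so
// a CMOVE supplies the "input was zero" answer instead.
#include <cstdint>
static unsigned cttz32(std::uint32_t X) {
  return X ? __builtin_ctz(X) : 32; // bsf result, or NumBits when ZF was set
}
static unsigned ctlz32(std::uint32_t X) {
  // The CTLZ path cmovs in 2*NumBits-1 and later xors with NumBits-1, turning
  // bsr's "index of the highest set bit" into a leading-zero count.
  return X ? __builtin_clz(X) : 32;
}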
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand())) if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && AI->use_empty()) @@ -26087,10 +26586,22 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return Loaded; } +bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const { + if (!SI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} +bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { + if (!LI.isUnordered()) + return false; + return ExperimentalUnorderedISEL; +} + + /// Emit a locked operation on a stack location which does not change any /// memory location, but does involve a lock prefix. Location is chosen to be /// a) very likely accessed only by a single thread to minimize cache traffic, -/// and b) definitely dereferenceable. Returns the new Chain result. +/// and b) definitely dereferenceable. Returns the new Chain result. static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, SDLoc DL) { @@ -26099,22 +26610,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, // operations issued by the current processor. As such, the location // referenced is not relevant for the ordering properties of the instruction. // See: Intel® 64 and IA-32 ArchitecturesSoftware Developer’s Manual, - // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions + // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions // 2) Using an immediate operand appears to be the best encoding choice // here since it doesn't require an extra register. // 3) OR appears to be very slightly faster than ADD. (Though, the difference // is small enough it might just be measurement noise.) // 4) When choosing offsets, there are several contributing factors: // a) If there's no redzone, we default to TOS. (We could allocate a cache - // line aligned stack object to improve this case.) + // line aligned stack object to improve this case.) // b) To minimize our chances of introducing a false dependence, we prefer - // to offset the stack usage from TOS slightly. + // to offset the stack usage from TOS slightly. // c) To minimize concerns about cross thread stack usage - in particular, // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which // captures state in the TOS frame and accesses it from many threads - // we want to use an offset such that the offset is in a distinct cache // line from the TOS frame. - // + // // For a general discussion of the tradeoffs and benchmark results, see: // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ @@ -26155,10 +26666,10 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); - AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( - cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( - cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + AtomicOrdering FenceOrdering = + static_cast<AtomicOrdering>(Op.getConstantOperandVal(1)); + SyncScope::ID FenceSSID = + static_cast<SyncScope::ID>(Op.getConstantOperandVal(2)); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. 
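// Standalone sketch of the idea behind emitLockedStackOp: a locked no-op RMW to
// a stack slot (almost certainly touched only by this thread) orders prior loads
// and stores like MFENCE, usually more cheaply. The offset and the use of OR
// here are illustrative; the backend chooses them as described in the comments
// above.
static inline void lockedStackFence() {
#if defined(__x86_64__)
  __asm__ __volatile__("lock orl $0, -64(%%rsp)" ::: "memory", "cc");
#endif
}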
@@ -26167,7 +26678,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); - SDValue Chain = Op.getOperand(0); + SDValue Chain = Op.getOperand(0); return emitLockedStackOp(DAG, Subtarget, Chain, dl); } @@ -26218,6 +26729,17 @@ static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT InVT = V.getSimpleValueType(); + if (InVT == MVT::v64i8) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(V, DL); + Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget); + Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget); + Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); + Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + DAG.getConstant(32, DL, MVT::i8)); + return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + } if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); @@ -26258,8 +26780,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(), - DstVT.getVectorNumElements() / 2); + MVT CastVT = DstVT.getHalfNumVectorElementsVT(); Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); @@ -26275,53 +26796,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getZExtOrTrunc(V, DL, DstVT); } - if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || - SrcVT == MVT::i64) { - assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - if (DstVT != MVT::f64 && DstVT != MVT::i64 && - !(DstVT == MVT::x86mmx && SrcVT.isVector())) - // This conversion needs to be expanded. - return SDValue(); + assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) && "Unexpected VT!"); - SDLoc dl(Op); - if (SrcVT.isVector()) { - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. - MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), - SrcVT.getVectorNumElements() * 2); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, - DAG.getUNDEF(SrcVT)); - } else { - assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && - "Unexpected source type in LowerBITCAST"); - Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); - } + assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); + if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) && + !(DstVT == MVT::x86mmx && SrcVT.isVector())) + // This conversion needs to be expanded. + return SDValue(); - MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; - Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); + SDLoc dl(Op); + if (SrcVT.isVector()) { + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), + SrcVT.getVectorNumElements() * 2); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, + DAG.getUNDEF(SrcVT)); + } else { + assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && + "Unexpected source type in LowerBITCAST"); + Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); + } - if (DstVT == MVT::x86mmx) - return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); + MVT V2X64VT = DstVT == MVT::f64 ? 
MVT::v2f64 : MVT::v2i64; + Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, - DAG.getIntPtrConstant(0, dl)); - } + if (DstVT == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); - assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && - Subtarget.hasMMX() && "Unexpected custom BITCAST"); - assert((DstVT == MVT::i64 || - (DstVT.isVector() && DstVT.getSizeInBits()==64)) && - "Unexpected custom BITCAST"); - // i64 <=> MMX conversions are Legal. - if (SrcVT==MVT::i64 && DstVT.isVector()) - return Op; - if (DstVT==MVT::i64 && SrcVT.isVector()) - return Op; - // MMX <=> MMX conversions are Legal. - if (SrcVT.isVector() && DstVT.isVector()) - return Op; - // All other conversions need to be expanded. - return SDValue(); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, + DAG.getIntPtrConstant(0, dl)); } /// Compute the horizontal sum of bytes in V for the elements of VT. @@ -26549,6 +27054,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); + // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB + // lowering. + if (VT == MVT::v8i64 || VT == MVT::v16i32) { + assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); + return Lower512IntUnary(Op, DAG); + } + unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); @@ -26656,12 +27168,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, // seq_cst which isn't SingleThread, everything just needs to be preserved // during codegen and then dropped. Note that we expect (but don't assume), // that orderings other than seq_cst and acq_rel have been canonicalized to - // a store or load. + // a store or load. if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent && AN->getSyncScopeID() == SyncScope::System) { // Prefer a locked operation against a stack location to minimize cache // traffic. This assumes that stack locations are very likely to be - // accessed only by the owning thread. + // accessed only by the owning thread. SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); assert(!N->hasAnyUseOfValue(0)); // NOTE: The getUNDEF is needed to give something for the unused result 0. @@ -26886,12 +27398,13 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); - if (VT == MVT::v2f32) { + if (VT == MVT::v2f32 || VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. 
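// Standalone scalar model of the PSHUFB-based byte BITREVERSE referenced above:
// split each byte into nibbles, bit-reverse each nibble through a 16-entry table
// (what the PSHUFB lookups do lane-wise), then swap the halves back together.
#include <cstdint>
static std::uint8_t reverseByte(std::uint8_t B) {
  static const std::uint8_t RevNib[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                          0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return std::uint8_t((RevNib[B & 0xF] << 4) | RevNib[B >> 4]);
}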
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( @@ -26901,30 +27414,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, return SDValue(); } - if (VT == MVT::v2i32) { - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 and we have VLX we can use xmm for data and index. - if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { - SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); - } - // Custom widen all the operands to avoid promotion. - EVT NewIndexVT = EVT::getVectorVT( - *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(Index.getValueType())); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, - Ops, N->getMemOperand()); - } - MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); @@ -27160,6 +27649,13 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, return NOOP; } +SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; +} + /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -27206,10 +27702,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: - case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget); + case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -27347,37 +27847,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::MUL: { EVT VT = N->getValueType(0); - assert(VT.isVector() && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && - VT.getVectorNumElements() == 2) { - // Promote to a pattern that will be turned into PMULUDQ. - SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, - N->getOperand(1)); - SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); - Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); - } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && - VT.getVectorElementType() == MVT::i8) { - // Pre-promote these to vXi16 to avoid op legalization thinking all 16 - // elements are needed. - MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); - SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); - SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - unsigned NumConcats = 16 / VT.getVectorNumElements(); - SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); - ConcatOps[0] = Res; - Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); - Results.push_back(Res); - } + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); + // Pre-promote these to vXi16 to avoid op legalization thinking all 16 + // elements are needed. 
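// Scalar model of the widened multiply handled below: an i8 multiply only needs
// the low 8 bits of the product, so doing it in 16 bits and truncating gives
// exactly the same result, which is why the pre-promotion to vXi16 is safe.
#include <cstdint>
static std::uint8_t mulLow8(std::uint8_t A, std::uint8_t B) {
  return std::uint8_t(std::uint16_t(A) * std::uint16_t(B));
}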
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); + SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); + SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + unsigned NumConcats = 16 / VT.getVectorNumElements(); + SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); + ConcatOps[0] = Res; + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); + Results.push_back(Res); return; } - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and @@ -27388,6 +27873,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), @@ -27404,9 +27891,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -27435,26 +27919,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Hi); return; } - case ISD::SETCC: { - // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when - // setCC result type is v2i1 because type legalzation will end up with - // a v4i1 setcc plus an extend. - assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); - if (N->getOperand(0).getValueType() != MVT::v2f32 || - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) - return; - SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); - SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(0), UNDEF); - SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, - N->getOperand(1), UNDEF); - SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, - N->getOperand(2)); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: @@ -27475,7 +27939,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); - if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { + if (VT.isVector()) { + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? @@ -27493,17 +27959,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (VT == MVT::v2i32) { - // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the - // v2i64 and unroll later. But then we create i64 scalar ops which - // might be slow in 64-bit mode or require a libcall in 32-bit mode. 
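// Why widening to reach the "divide by constant" path pays off (standalone
// illustration, not LLVM code): an unsigned divide by a constant is strength-
// reduced to a multiply-high plus shifts, avoiding a hardware div entirely.
#include <cstdint>
static std::uint32_t div7(std::uint32_t X) {
  return X / 7; // compilers emit mul + shifts for this, no div instruction
}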
- Results.push_back(DAG.UnrollVectorOp(N)); - return; - } - - if (VT.isVector()) - return; - LLVM_FALLTHROUGH; } case ISD::SDIVREM: @@ -27561,58 +28016,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } } - return; - } - case ISD::SIGN_EXTEND_VECTOR_INREG: { - if (ExperimentalVectorWideningLegalization) - return; - - EVT VT = N->getValueType(0); - SDValue In = N->getOperand(0); - EVT InVT = In.getValueType(); - if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { - // Custom split this so we can extend i8/i16->i32 invec. This is better - // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using - // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting - // we allow the sra from the extend to i32 to be shared by the split. - EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), - InVT.getVectorElementType(), - InVT.getVectorNumElements() / 2); - MVT ExtendVT = MVT::getVectorVT(MVT::i32, - VT.getVectorNumElements()); - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, - In, DAG.getIntPtrConstant(0, dl)); - In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); - - // Fill a vector with sign bits for each element. - SDValue Zero = DAG.getConstant(0, dl, ExtendVT); - SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); - - EVT LoVT, HiVT; - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); - - // Create an unpackl and unpackh to interleave the sign bits then bitcast - // to vXi64. - SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); - Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); - SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); - Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); + if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && + getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && + isTypeLegal(MVT::v4i64)) { + // Input needs to be split and output needs to widened. Let's use two + // VTRUNCs, and shuffle their results together into the wider type. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(In, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); + Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); + SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, + { 0, 1, 2, 3, 16, 17, 18, 19, + -1, -1, -1, -1, -1, -1, -1, -1 }); Results.push_back(Res); return; } + return; } + case ISD::ANY_EXTEND: + // Right now, only MVT::v8i8 has Custom action for an illegal type. + // It's intended to custom handle the input type. + assert(N->getValueType(0) == MVT::v8i8 && + "Do not know how to legalize this Node"); + return; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && - (InVT == MVT::v4i16 || InVT == MVT::v4i8) && - getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) { + (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ + assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && + "Unexpected type action!"); assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); // Custom split this so we can extend i8/i16->i32 invec. 
This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using @@ -27683,27 +28120,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); - // Promote these manually to avoid over promotion to v2i64. Type - // legalization will revisit the v2i32 operation for more cleanup. - if ((VT == MVT::v2i8 || VT == MVT::v2i16) && - getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { - // AVX512DQ provides instructions that produce a v2i64 result. - if (Subtarget.hasDQI()) - return; - - SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); - Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext - : ISD::AssertSext, - dl, MVT::v2i32, Res, - DAG.getValueType(VT.getVectorElementType())); - Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); - Results.push_back(Res); - return; - } - if (VT.isVector() && VT.getScalarSizeInBits() < 32) { - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); @@ -27738,35 +28157,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); - bool Widenv2i32 = - getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { - // If v2i32 is widened, we can defer to the generic legalizer. - if (Widenv2i32) - return; - // Custom widen by doubling to a legal vector with. Isel will - // further widen to v8f64. - Opc = ISD::FP_TO_UINT; - Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, - Src, DAG.getUNDEF(MVT::v2f64)); + // If we have VLX we can emit a target specific FP_TO_UINT node, + // otherwise we can defer to the generic legalizer which will widen + // the input as well. This will be further widened during op + // legalization to v8i32<-v8f64. + return; } + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); - if (!Widenv2i32) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - return; - } - if (SrcVT == MVT::v2f32 && - getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - SDValue Idx = DAG.getIntPtrConstant(0, dl); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32)); - Res = DAG.getNode(IsSigned ? 
ISD::FP_TO_SINT - : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -27776,6 +28178,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } + assert(!VT.isVector() && "Vectors should have been handled above!"); + if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); @@ -27847,7 +28251,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + unsigned IntNo = N->getConstantOperandVal(1); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); @@ -27905,7 +28309,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); - unsigned BasePtr = TRI->getBaseRegister(); + Register BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { @@ -28060,34 +28464,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (SrcVT != MVT::f64 || - (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || - getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) + if (DstVT.isVector() && SrcVT == MVT::x86mmx) { + assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); + SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); + Results.push_back(Res); return; + } - unsigned NumElts = DstVT.getVectorNumElements(); - EVT SVT = DstVT.getVectorElementType(); - EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue Res; - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); - Res = DAG.getBitcast(WiderVT, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); - if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if ((VT == MVT::v2f32 || VT == MVT::v2i32) && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast<MaskedGatherSDNode>(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Gather->getPassThru(), - DAG.getUNDEF(MVT::v2f32)); + DAG.getUNDEF(VT)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. 
@@ -28098,66 +28501,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, + DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } - if (VT == MVT::v2i32) { - auto *Gather = cast<MaskedGatherSDNode>(N); - SDValue Index = Gather->getIndex(); - SDValue Mask = Gather->getMask(); - assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); - SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, - Gather->getPassThru(), - DAG.getUNDEF(MVT::v2i32)); - // If the index is v2i64 we can use it directly. - if (Index.getValueType() == MVT::v2i64 && - (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { - if (!Subtarget.hasVLX()) { - // We need to widen the mask, but the instruction will only use 2 - // of its elements. So we can use undef. - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getUNDEF(MVT::v2i1)); - Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); - } - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); - SDValue Chain = Res.getValue(2); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { - EVT IndexVT = Index.getValueType(); - EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), - IndexVT.getScalarType(), 4); - // Otherwise we need to custom widen everything to avoid promotion. - Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, - DAG.getUNDEF(IndexVT)); - Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, - DAG.getConstant(0, dl, MVT::v2i1)); - SDValue Ops[] = { Gather->getChain(), PassThru, Mask, - Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), - Gather->getMemoryVT(), dl, Ops, - Gather->getMemOperand()); - SDValue Chain = Res.getValue(1); - if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, - DAG.getIntPtrConstant(0, dl)); - Results.push_back(Res); - Results.push_back(Chain); - return; - } - } return; } case ISD::LOAD: { @@ -28166,8 +28515,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // cast since type legalization will try to use an i64 load. 
MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); - if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) - return; + assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && + "Unexpected type action!"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); @@ -28177,11 +28526,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - MVT WideVT = MVT::getVectorVT(LdVT, 2); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); - MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), - VT.getVectorNumElements() * 2); - Res = DAG.getBitcast(CastVT, Res); + MVT VecVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); + EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); + Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); Results.push_back(Chain); return; @@ -28236,6 +28584,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; @@ -28373,6 +28722,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -28737,6 +29087,9 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0))) + return false; + EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. @@ -28856,10 +29209,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned DstReg = MI.getOperand(0).getReg(); + Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned fallDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB @@ -28913,7 +29266,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); - unsigned DestReg = MI.getOperand(0).getReg(); + Register DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); @@ -29049,7 +29402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, assert(OffsetReg != 0); // Read the reg_save_area address. 
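// Standalone model of the SysV x86-64 va_arg expansion being emitted above, for
// a GP-register argument; the struct mirrors the ABI's va_list layout and the
// helper is illustrative, not LLVM code.
struct VAList64 {
  unsigned GPOffset, FPOffset;
  void *OverflowArgArea, *RegSaveArea;
};
static void *nextIntArg(VAList64 &AP) {
  if (AP.GPOffset < 48) {                                       // 6 GP regs * 8 bytes
    void *P = static_cast<char *>(AP.RegSaveArea) + AP.GPOffset;
    AP.GPOffset += 8;
    return P;
  }
  void *P = AP.OverflowArgArea;                                 // spilled to the stack
  AP.OverflowArgArea = static_cast<char *>(P) + 8;
  return P;
}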
- unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) @@ -29059,8 +29412,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .setMemRefs(LoadOnlyMMO); // Zero-extend the offset - unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); @@ -29071,7 +29424,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, .addReg(RegSaveReg); // Compute the offset for the next argument - unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); @@ -29096,7 +29449,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // // Load the overflow_area address into a register. - unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) @@ -29110,7 +29463,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); - unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) @@ -29127,7 +29480,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // Compute the next overflow address after this argument. 
// (the overflow address should be kept 8-byte aligned) - unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); @@ -29191,7 +29544,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); - unsigned CountReg = MI.getOperand(0).getReg(); + Register CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); @@ -29273,7 +29626,9 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: + case X86::CMOV_FR32X: case X86::CMOV_FR64: + case X86::CMOV_FR64X: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: @@ -29326,9 +29681,9 @@ static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); + Register DestReg = MIIt->getOperand(0).getReg(); + Register Op1Reg = MIIt->getOperand(1).getReg(); + Register Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the @@ -29486,9 +29841,9 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] - unsigned DestReg = FirstCMOV.getOperand(0).getReg(); - unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); - unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); + Register DestReg = FirstCMOV.getOperand(0).getReg(); + Register Op1Reg = FirstCMOV.getOperand(1).getReg(); + Register Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) @@ -30006,7 +30361,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); - unsigned CalleeVReg = MI.getOperand(0).getReg(); + Register CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can @@ -30079,7 +30434,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30087,7 +30442,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? 
X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30131,8 +30486,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; - unsigned mainDstReg = MRI.createVirtualRegister(RC); - unsigned restoreDstReg = MRI.createVirtualRegister(RC); + Register mainDstReg = MRI.createVirtualRegister(RC); + Register restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; @@ -30246,8 +30601,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); - unsigned FramePtr = RegInfo->getFrameRegister(*MF); - unsigned BasePtr = RegInfo->getBaseRegister(); + Register FramePtr = RegInfo->getFrameRegister(*MF); + Register BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) @@ -30329,7 +30684,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. - unsigned ZReg = MRI.createVirtualRegister(PtrRC); + Register ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) .addDef(ZReg) @@ -30337,7 +30692,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. - unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); + Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); @@ -30352,7 +30707,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. - unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC); + Register PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = @@ -30370,7 +30725,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. - unsigned SspSubReg = MRI.createVirtualRegister(PtrRC); + Register SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) @@ -30384,7 +30739,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; - unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC); + Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); @@ -30394,7 +30749,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. 
- unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC); + Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); @@ -30406,12 +30761,12 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; - unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC); + Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg); // Save the value 128 to a register (will be used next with incssp). - unsigned Value128InReg = MRI.createVirtualRegister(PtrRC); + Register Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) .addImm(128); @@ -30419,8 +30774,8 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. - unsigned DecReg = MRI.createVirtualRegister(PtrRC); - unsigned CounterReg = MRI.createVirtualRegister(PtrRC); + Register DecReg = MRI.createVirtualRegister(PtrRC); + Register CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) @@ -30460,11 +30815,11 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; - unsigned Tmp = MRI.createVirtualRegister(RC); + Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; - unsigned SP = RegInfo->getStackRegister(); + Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; @@ -30662,8 +31017,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setRestoreBasePointer(MF); - unsigned FP = RI.getFrameRegister(*MF); - unsigned BP = RI.getBaseRegister(); + Register FP = RI.getFrameRegister(*MF); + Register BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) @@ -30674,7 +31029,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, } // IReg is used as an index in a memory operand and therefore can't be SP - unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); + Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 
8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) @@ -30683,8 +31038,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); if (Subtarget.is64Bit()) { - unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) @@ -30710,9 +31065,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { - unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); - unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); - unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) @@ -30783,8 +31138,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); - for (unsigned RI = 0; SavedRegs[RI]; ++RI) { - unsigned Reg = SavedRegs[RI]; + for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) { + unsigned Reg = SavedRegs[RegIdx]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } @@ -30906,20 +31261,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); // Load the old value of the control word... - unsigned OldCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), OrigCWFrameIdx); // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero. - unsigned NewCW = - MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) .addReg(OldCW, RegState::Kill).addImm(0xC00); // Extract to 16 bits. - unsigned NewCW16 = - MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + Register NewCW16 = + MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) .addReg(NewCW, RegState::Kill, X86::sub_16bit); @@ -31023,7 +31376,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); - unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); + Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B @@ -31034,10 +31387,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. 
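For readers unfamiliar with the 0xC00 constant in the FP control-word sequence above: bits 10-11 of the x87 control word form the rounding-control field, and setting both bits selects round-toward-zero, which is what the truncating FP-to-integer conversion being expanded here needs. A minimal scalar illustration follows (plain integers only; the FNSTCW/FLDCW bracketing around the conversion is not modelled, and the helper name is invented):

#include <cstdint>
#include <cstdio>

// The x87 rounding-control field lives in bits 10-11 of the control word;
// 0b11 selects round-toward-zero (truncation). This mirrors the OR32ri with
// 0xC00 in the hunk above, applied to a plain integer for illustration.
static uint16_t roundTowardZero(uint16_t controlWord) {
  return static_cast<uint16_t>(controlWord | 0x0C00);
}

int main() {
  uint16_t defaultCW = 0x037F;            // power-on/FNINIT control word,
                                          // RC = 00 (round to nearest)
  uint16_t truncCW = roundTowardZero(defaultCW);
  std::printf("RC field: %u -> %u\n", (defaultCW >> 10) & 3,
              (truncCW >> 10) & 3);       // prints "RC field: 0 -> 3"
  return 0;
}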
- MachineBasicBlock::iterator MBBI(MI); - while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || - MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) - --MBBI; + MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator()); + while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) || + RMBBI->definesRegister(X86::EBX) || + RMBBI->definesRegister(X86::ECX) || + RMBBI->definesRegister(X86::EDX))) { + ++RMBBI; + } + MachineBasicBlock::iterator MBBI(RMBBI); addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); @@ -31232,12 +31589,21 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One |= Known2.One; break; } + case X86ISD::PSADBW: { + assert(VT.getScalarType() == MVT::i64 && + Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && + "Unexpected PSADBW types"); + + // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result. + Known.Zero.setBitsFrom(16); + break; + } case X86ISD::CMOV: { - Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); + Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); // If we don't know any bits, early out. if (Known.isUnknown()) break; - KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1); + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; @@ -31650,8 +32016,8 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { - ArrayRef<int> LoMask(Mask.data() + 0, 4); - ArrayRef<int> HiMask(Mask.data() + 4, 4); + ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4); + ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && @@ -31789,8 +32155,8 @@ static bool matchBinaryPermuteShuffle( uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end()); - if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, - BlendMask)) { + if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero, + ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector<int, 8> RepeatedMask; @@ -31819,15 +32185,15 @@ static bool matchBinaryPermuteShuffle( } } - // Attempt to combine to INSERTPS. + // Attempt to combine to INSERTPS, but only if it has elements that need to + // be set to zero. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && - MaskVT.is128BitVector()) { - if (Zeroable.getBoolValue() && - matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { - Shuffle = X86ISD::INSERTPS; - ShuffleVT = MVT::v4f32; - return true; - } + MaskVT.is128BitVector() && + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; } // Attempt to combine to SHUFPD. 
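The LoMask/HiMask fix above feeds the PSHUFLW/PSHUFHW matching, where the chosen 4-element half of the repeated mask is packed into an 8-bit immediate of four 2-bit source indices. A small sketch of that packing, using a hypothetical helper name (pshuflwImmediate is not an LLVM function):

#include <array>
#include <cassert>
#include <cstdint>

// Hypothetical helper (not from the patch): build the PSHUFLW imm8 from the
// low half of a repeated v8i16 mask. Destination word i takes a 2-bit source
// index at bits [2*i+1 : 2*i]; undef lanes (-1) may pick any source, here 0.
static uint8_t pshuflwImmediate(const std::array<int, 4> &LoMask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = LoMask[i];
    assert(M < 4 && "low-half PSHUFLW can only read words 0-3");
    Imm |= static_cast<uint8_t>((M < 0 ? 0 : M) << (2 * i));
  }
  return Imm;
}

int main() {
  // Reversing the low four words: <3,2,1,0> -> 0b00011011 == 0x1B.
  assert(pshuflwImmediate({3, 2, 1, 0}) == 0x1B);
  // Identity low half: <0,1,2,3> -> 0b11100100 == 0xE4.
  assert(pshuflwImmediate({0, 1, 2, 3}) == 0xE4);
  return 0;
}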
@@ -31835,7 +32201,11 @@ static bool matchBinaryPermuteShuffle( ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { + bool ForceV1Zero = false, ForceV2Zero = false; + if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, + PermuteImm, Mask, Zeroable)) { + V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; + V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; @@ -31889,6 +32259,15 @@ static bool matchBinaryPermuteShuffle( } } + // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed. + if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && + MaskVT.is128BitVector() && + matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { + Shuffle = X86ISD::INSERTPS; + ShuffleVT = MVT::v4f32; + return true; + } + return false; } @@ -31942,7 +32321,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || - (RootVT.isFloatingPoint() && Depth >= 2) || + (RootVT.isFloatingPoint() && Depth >= 1) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size @@ -31981,7 +32360,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) + if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; @@ -31991,7 +32370,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), - DAG.getConstant(PermMask, DL, MVT::i8)); + DAG.getTargetConstant(PermMask, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32026,8 +32405,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. // TODO: Should we indicate which domain is preferred if both are allowed? - bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && + bool AllowFloatDomain = FloatDomain || (Depth >= 3); + bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. @@ -32062,14 +32441,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (V1.getValueType() == MaskVT && V1.getOpcode() == ISD::SCALAR_TO_VECTOR && MayFoldLoad(V1.getOperand(0))) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! 
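matchShuffleWithSHUFPD, extended above to take zeroable lanes, ultimately produces a one-bit-per-element SHUFPD immediate: bit i selects the high or low double of the source feeding lane i, and a lane that must be zero can now be satisfied by swapping a zero vector in for that source (the new ForceV1Zero/ForceV2Zero outputs). The 128-bit case can be sketched as follows; shufpdImmediate is an invented name and the per-lane generalisation for 256/512-bit vectors is omitted:

#include <cassert>
#include <cstdint>

// Sketch of the SHUFPD immediate for a 128-bit shuffle mask {M0, M1}, where
// M0 must come from V1 (element 0 or 1) and M1 from V2 (element 2 or 3).
// Bit i of the immediate picks the high (1) or low (0) double of the
// corresponding source.
static uint8_t shufpdImmediate(int M0, int M1) {
  assert(M0 == 0 || M0 == 1);
  assert(M1 == 2 || M1 == 3);
  return static_cast<uint8_t>((M0 & 1) | ((M1 & 1) << 1));
}

int main() {
  assert(shufpdImmediate(0, 2) == 0x0); // low of V1, low of V2 (UNPCKLPD-like)
  assert(shufpdImmediate(1, 3) == 0x3); // high of V1, high of V2 (UNPCKHPD-like)
  assert(shufpdImmediate(1, 2) == 0x1);
  return 0;
}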
Res = V1.getOperand(0); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); return DAG.getBitcast(RootVT, Res); } if (Subtarget.hasAVX2()) { - if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST) + if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) return SDValue(); // Nothing to do! Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); @@ -32083,7 +32462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -32094,11 +32473,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } @@ -32109,7 +32488,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); @@ -32123,12 +32502,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { - if (Depth == 1 && Root.getOpcode() == Shuffle) + if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, - DAG.getConstant(PermuteImm, DL, MVT::i8)); + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32141,34 +32520,34 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { - if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! 
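The EXTRQI/INSERTQI immediates switched to getTargetConstant above are bit lengths and bit indices into the low 64 bits of the source. Assuming the usual SSE4A semantics (stated from memory, so treat the model as an approximation rather than a specification), the extract side behaves like a zero-extended bit-field read:

#include <cassert>
#include <cstdint>

// Scalar model (an assumption for illustration, not the DAG code) of what the
// EXTRQI node computes on the low 64 bits of its source: a zero-extended
// bit-field of BitLen bits starting at BitIdx. A width of exactly 64 has a
// special encoding in the real instruction and is ignored here.
static uint64_t extrqiModel(uint64_t Src, unsigned BitLen, unsigned BitIdx) {
  assert(BitLen >= 1 && BitLen <= 63 && BitIdx + BitLen <= 64);
  uint64_t Mask = (uint64_t(1) << BitLen) - 1;
  return (Src >> BitIdx) & Mask;
}

int main() {
  // Extracting byte 2 of the low quadword: BitLen = 8, BitIdx = 16.
  assert(extrqiModel(0x00000000CAFE55AAull, 8, 16) == 0xFE);
  // Extracting the low 16-bit word.
  assert(extrqiModel(0x1234567890ABCDEFull, 16, 0) == 0xCDEF);
  return 0;
}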
V1 = DAG.getBitcast(IntMaskVT, V1); V2 = DAG.getBitcast(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); + DAG.getTargetConstant(BitLen, DL, MVT::i8), + DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. - if (Depth < 2) + if (Depth < 1) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. - int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = @@ -32321,7 +32700,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, V2 = DAG.getBitcast(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, - DAG.getConstant(M2ZImm, DL, MVT::i8)); + DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } @@ -32650,7 +33029,7 @@ static SDValue combineX86ShufflesRecursively( // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; - if (Depth > MaxRecursionDepth) + if (Depth >= MaxRecursionDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. @@ -32667,11 +33046,18 @@ static SDValue combineX86ShufflesRecursively( "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. + // TODO - determine Op's demanded elts from RootMask. SmallVector<int, 64> OpMask; SmallVector<SDValue, 2> OpInputs; - if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) + APInt OpUndef, OpZero; + APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); + if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, + OpZero, DAG, Depth, false)) return SDValue(); + resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero); + // Add the inputs to the Ops list, avoiding duplicates. SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end()); @@ -32772,6 +33158,9 @@ static SDValue combineX86ShufflesRecursively( Mask[i] = OpMaskedIdx; } + // Remove unused/repeated shuffle source ops. + resolveTargetShuffleInputsAndMask(Ops, Mask); + // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); @@ -32783,11 +33172,8 @@ static SDValue combineX86ShufflesRecursively( return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); - // Remove unused/repeated shuffle source ops. - resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); - - HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); + HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(), @@ -32853,7 +33239,7 @@ static SDValue combineX86ShufflesRecursively( /// Helper entry wrapper to combineX86ShufflesRecursively. 
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, + return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget); } @@ -33088,7 +33474,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, for (unsigned i = 0; i != Scale; ++i) DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( - {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, + {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -33120,6 +33506,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, VT.getSizeInBits()); } + // vbroadcast(scalarload X) -> vbroadcast_load X + // For float loads, extract other uses of the scalar from the broadcast. + if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(Src); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceExtract = Src.hasOneUse(); + DCI.CombineTo(N.getNode(), BcastLd); + if (NoReplaceExtract) { + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + } else { + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, + DAG.getIntPtrConstant(0, DL)); + DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); + } + return N; // Return N so it doesn't get rechecked! + } + return SDValue(); } case X86ISD::BLENDI: { @@ -33133,14 +33543,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, MVT SrcVT = N0.getOperand(0).getSimpleValueType(); if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && SrcVT.getScalarSizeInBits() >= 32) { - unsigned Mask = N.getConstantOperandVal(2); + unsigned BlendMask = N.getConstantOperandVal(2); unsigned Size = VT.getVectorNumElements(); unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); - unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale); + BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), N1.getOperand(0), - DAG.getConstant(ScaleMask, DL, MVT::i8))); + DAG.getTargetConstant(BlendMask, DL, MVT::i8))); } } return SDValue(); @@ -33208,76 +33618,97 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. 
SmallVector<int, 8> TargetMask1; SmallVector<SDValue, 2> Ops1; - if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { - int M = TargetMask1[SrcIdx]; - if (isUndefOrZero(M)) { + APInt KnownUndef1, KnownZero1; + if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1, + KnownZero1)) { + if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. + int M = TargetMask1[SrcIdx]; assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector<int, 8> TargetMask0; SmallVector<SDValue, 2> Ops0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) - return SDValue(); + APInt KnownUndef0, KnownZero0; + if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0, + KnownZero0)) { + bool Updated = false; + bool UseInput00 = false; + bool UseInput01 = false; + for (int i = 0; i != 4; ++i) { + if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { + // No change if element is already zero or the inserted element. + continue; + } else if (KnownUndef0[i] || KnownZero0[i]) { + // If the target mask is undef/zero then we must zero the element. + InsertPSMask |= (1u << i); + Updated = true; + continue; + } - bool Updated = false; - bool UseInput00 = false; - bool UseInput01 = false; - for (int i = 0; i != 4; ++i) { - int M = TargetMask0[i]; - if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { - // No change if element is already zero or the inserted element. - continue; - } else if (isUndefOrZero(M)) { - // If the target mask is undef/zero then we must zero the element. - InsertPSMask |= (1u << i); - Updated = true; - continue; + // The input vector element must be inline. + int M = TargetMask0[i]; + if (M != i && M != (i + 4)) + return SDValue(); + + // Determine which inputs of the target shuffle we're using. + UseInput00 |= (0 <= M && M < 4); + UseInput01 |= (4 <= M); } - // The input vector element must be inline. - if (M != i && M != (i + 4)) - return SDValue(); + // If we're not using both inputs of the target shuffle then use the + // referenced input directly. + if (UseInput00 && !UseInput01) { + Updated = true; + Op0 = Ops0[0]; + } else if (!UseInput00 && UseInput01) { + Updated = true; + Op0 = Ops0[1]; + } - // Determine which inputs of the target shuffle we're using. - UseInput00 |= (0 <= M && M < 4); - UseInput01 |= (4 <= M); + if (Updated) + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } - // If we're not using both inputs of the target shuffle then use the - // referenced input directly. - if (UseInput00 && !UseInput01) { - Updated = true; - Op0 = Ops0[0]; - } else if (!UseInput00 && UseInput01) { - Updated = true; - Op0 = Ops0[1]; + // If we're inserting an element from a vbroadcast load, fold the + // load into the X86insertps instruction. We need to convert the scalar + // load to a vector and clear the source lane of the INSERTPS control. 
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op1); + if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { + SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getMemOperand()); + SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + Load), + DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Insert; + } } - if (Updated) - return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, - DAG.getConstant(InsertPSMask, DL, MVT::i8)); - return SDValue(); } default: @@ -33580,7 +34011,7 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, } /// Eliminate a redundant shuffle of a horizontal math op. -static SDValue foldShuffleOfHorizOp(SDNode *N) { +static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST) if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) @@ -33611,17 +34042,36 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); + // The shuffle that we are eliminating may have allowed the horizontal op to + // have an undemanded (undefined) operand. Duplicate the other (defined) + // operand to ensure that the results are defined across all lanes without the + // shuffle. + auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) { + SDValue X; + if (HorizOp.getOperand(0).isUndef()) { + assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(1); + } else if (HorizOp.getOperand(1).isUndef()) { + assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op"); + X = HorizOp.getOperand(0); + } else { + return HorizOp; + } + return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp), + HorizOp.getValueType(), X, X); + }; + // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If a target shuffle is also - // replicating low and high halves, we don't need the shuffle. + // replicating low and high halves (and without changing the type/length of + // the vector), we don't need the shuffle. 
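The rewritten comment above is the whole justification for foldShuffleOfHorizOp, and it is easy to check on a scalar model of HADDPS with equal operands (the helper below is invented for the illustration and is not part of the patch):

#include <array>
#include <cassert>

// Scalar model of HADDPS: lane pairs of each operand are summed, the first
// operand filling the low half of the result and the second the high half.
// When both operands are the same vector, the two halves are identical, so a
// shuffle that merely replicates the low half (MOVDDUP, or a broadcast of the
// low element pair) is redundant.
static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> X = {1.0f, 2.0f, 4.0f, 8.0f};
  std::array<float, 4> H = haddps(X, X);
  // Low half == high half, so a <0,1,0,1> shuffle of H is the identity.
  assert(H[0] == H[2] && H[1] == H[3]);
  return 0;
}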
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) { - if (HOp.getScalarValueSizeInBits() == 64) { + if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) { // movddup (hadd X, X) --> hadd X, X // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X assert((HOp.getValueType() == MVT::v2f64 || - HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT && - "Unexpected type for h-op"); - return HOp; + HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op"); + return updateHOp(HOp, DAG); } return SDValue(); } @@ -33635,14 +34085,14 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) { (isTargetShuffleEquivalent(Mask, {0, 0}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) || isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) - return HOp; + return updateHOp(HOp, DAG); if (HOp.getValueSizeInBits() == 256 && (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || isTargetShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) - return HOp; + return updateHOp(HOp, DAG); return SDValue(); } @@ -33677,7 +34127,7 @@ static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { // the wide shuffle that we started with. return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), Shuf->getOperand(1), HalfMask, HalfIdx1, - HalfIdx2, false, DAG); + HalfIdx2, false, DAG, /*UseConcat*/true); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, @@ -33696,70 +34146,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; - if (SDValue HAddSub = foldShuffleOfHorizOp(N)) + if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG)) return HAddSub; } - // During Type Legalization, when promoting illegal vector types, - // the backend might introduce new shuffle dag nodes and bitcasts. - // - // This code performs the following transformation: - // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) -> - // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>) - // - // We do this only if both the bitcast and the BINOP dag nodes have - // one use. Also, perform this transformation only if the new binary - // operation is legal. This is to avoid introducing dag nodes that - // potentially need to be further expanded (or custom lowered) into a - // less optimal sequence of dag nodes. - if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - SDValue BC0 = N0.getOperand(0); - EVT SVT = BC0.getValueType(); - unsigned Opcode = BC0.getOpcode(); - unsigned NumElts = VT.getVectorNumElements(); - - if (BC0.hasOneUse() && SVT.isVector() && - SVT.getVectorNumElements() * 2 == NumElts && - TLI.isOperationLegal(Opcode, VT)) { - bool CanFold = false; - switch (Opcode) { - default : break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: - // isOperationLegal lies for integer ops on floating point types. - CanFold = VT.isInteger(); - break; - case ISD::FADD: - case ISD::FSUB: - case ISD::FMUL: - // isOperationLegal lies for floating point ops on integer types. 
- CanFold = VT.isFloatingPoint(); - break; - } - - unsigned SVTNumElts = SVT.getVectorNumElements(); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) == (int)(i * 2); - for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) - CanFold = SVOp->getMaskElt(i) < 0; - - if (CanFold) { - SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); - SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); - SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); - return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); - } - } - } - // Attempt to combine into a vector load/broadcast. if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) return LD; @@ -33841,7 +34231,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && ISD::isNormalLoad(N->getOperand(0).getNode())) { LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - if (!LN->isVolatile()) { + if (LN->isSimple()) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; SDValue VZLoad = @@ -33855,53 +34245,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } } - - // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the - // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. - // FIXME: This can probably go away once we default to widening legalization. - if (Subtarget.hasSSE41() && VT == MVT::v4i32 && - N->getOpcode() == ISD::VECTOR_SHUFFLE && - N->getOperand(0).getOpcode() == ISD::BITCAST && - N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { - SDValue BC = N->getOperand(0); - SDValue MULUDQ = BC.getOperand(0); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - ArrayRef<int> Mask = SVOp->getMask(); - if (BC.hasOneUse() && MULUDQ.hasOneUse() && - Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { - SDValue Op0 = MULUDQ.getOperand(0); - SDValue Op1 = MULUDQ.getOperand(1); - if (Op0.getOpcode() == ISD::BITCAST && - Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op0.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp0 = - cast<ShuffleVectorSDNode>(Op0.getOperand(0)); - ArrayRef<int> Mask2 = SVOp0->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = SVOp0->getOperand(0); - Op1 = DAG.getBitcast(MVT::v4i32, Op1); - Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - if (Op1.getOpcode() == ISD::BITCAST && - Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && - Op1.getOperand(0).getValueType() == MVT::v4i32) { - ShuffleVectorSDNode *SVOp1 = - cast<ShuffleVectorSDNode>(Op1.getOperand(0)); - ArrayRef<int> Mask2 = SVOp1->getMask(); - if (Mask2[0] == 0 && Mask2[1] == -1 && - Mask2[2] == 1 && Mask2[3] == -1) { - Op0 = DAG.getBitcast(MVT::v4i32, Op0); - Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); - Op1 = SVOp1->getOperand(0); - return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); - } - } - } - } - return SDValue(); } @@ -33966,6 +34309,84 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // TODO convert SrcUndef to KnownUndef. 
break; } + case X86ISD::KSHIFTL: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a + // single shift. We can do this if the bottom bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTR) { + if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTL; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTR; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef <<= ShiftAmt; + KnownZero <<= ShiftAmt; + KnownZero.setLowBits(ShiftAmt); + break; + } + case X86ISD::KSHIFTR: { + SDValue Src = Op.getOperand(0); + auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); + assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); + unsigned ShiftAmt = Amt->getZExtValue(); + + if (ShiftAmt == 0) + return TLO.CombineTo(Op, Src); + + // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a + // single shift. We can do this if the top bits (which are shifted + // out) are never demanded. + if (Src.getOpcode() == X86ISD::KSHIFTL) { + if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) { + unsigned C1 = Src.getConstantOperandVal(1); + unsigned NewOpc = X86ISD::KSHIFTR; + int Diff = ShiftAmt - C1; + if (Diff < 0) { + Diff = -Diff; + NewOpc = X86ISD::KSHIFTL; + } + + SDLoc dl(Op); + SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); + return TLO.CombineTo( + Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); + } + } + + APInt DemandedSrc = DemandedElts.shl(ShiftAmt); + if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, + Depth + 1)) + return true; + + KnownUndef.lshrInPlace(ShiftAmt); + KnownZero.lshrInPlace(ShiftAmt); + KnownZero.setHighBits(ShiftAmt); + break; + } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); @@ -33979,16 +34400,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } case X86ISD::PACKSS: case X86ISD::PACKUS: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; - if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; - if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, - SrcZero, TLO, Depth + 1)) + if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO, + Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded elts. + // TODO - we should do this for all target/faux shuffles ops. 
+ if (!DemandedElts.isAllOnesValue()) { + APInt DemandedSrcBits = + APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); + SDValue NewN0 = SimplifyMultipleUseDemandedBits( + N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1); + SDValue NewN1 = SimplifyMultipleUseDemandedBits( + N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1); + if (NewN0 || NewN1) { + NewN0 = NewN0 ? NewN0 : N0; + NewN1 = NewN1 ? NewN1 : N1; + return TLO.CombineTo(Op, + TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); + } + } break; } case X86ISD::HADD: @@ -34062,25 +34503,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return true; break; } - case X86ISD::SUBV_BROADCAST: { - // Reduce size of broadcast if we don't need the upper half. - unsigned HalfElts = NumElts / 2; - if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) { - SDValue Src = Op.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); - - SDValue Half = Src; - if (SrcVT.getVectorNumElements() != HalfElts) { - MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts); - Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src); - } - - return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0, - TLO.DAG, SDLoc(Op), - Half.getValueSizeInBits())); - } - break; - } case X86ISD::VPERMV: { SDValue Mask = Op.getOperand(0); APInt MaskUndef, MaskZero; @@ -34135,6 +34557,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); return TLO.CombineTo(Op, Insert); } + // Subvector broadcast. + case X86ISD::SUBV_BROADCAST: { + SDLoc DL(Op); + SDValue Src = Op.getOperand(0); + if (Src.getValueSizeInBits() > ExtSizeInBits) + Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); + else if (Src.getValueSizeInBits() < ExtSizeInBits) { + MVT SrcSVT = Src.getSimpleValueType().getScalarType(); + MVT SrcVT = + MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits()); + Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src); + } + return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0, + TLO.DAG, DL, ExtSizeInBits)); + } // Byte shifts by immediate. case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -34201,36 +34638,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } } - // Simplify target shuffles. - if (!isTargetShuffle(Opc) || !VT.isSimple()) - return false; - - // Get target shuffle mask. - bool IsUnary; + // Get target/faux shuffle mask. + APInt OpUndef, OpZero; SmallVector<int, 64> OpMask; SmallVector<SDValue, 2> OpInputs; - if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, - OpMask, IsUnary)) + if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, + OpZero, TLO.DAG, Depth, false)) return false; - // Shuffle inputs must be the same type as the result. - if (llvm::any_of(OpInputs, - [VT](SDValue V) { return VT != V.getValueType(); })) + // Shuffle inputs must be the same size as the result. + if (OpMask.size() != (unsigned)NumElts || + llvm::any_of(OpInputs, [VT](SDValue V) { + return VT.getSizeInBits() != V.getValueSizeInBits() || + !V.getValueType().isVector(); + })) return false; - // Clear known elts that might have been set above. - KnownZero.clearAllBits(); - KnownUndef.clearAllBits(); + KnownZero = OpZero; + KnownUndef = OpUndef; // Check if shuffle mask can be simplified to undef/zero/identity. 
int NumSrcs = OpInputs.size(); - for (int i = 0; i != NumElts; ++i) { - int &M = OpMask[i]; + for (int i = 0; i != NumElts; ++i) if (!DemandedElts[i]) - M = SM_SentinelUndef; - else if (0 <= M && OpInputs[M / NumElts].isUndef()) - M = SM_SentinelUndef; - } + OpMask[i] = SM_SentinelUndef; if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); @@ -34243,10 +34674,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) - return TLO.CombineTo(Op, OpInputs[Src]); + return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src])); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { + // TODO: Support inputs of different types. + if (OpInputs[Src].getValueType() != VT) + continue; + int Lo = Src * NumElts; APInt SrcElts = APInt::getNullValue(NumElts); for (int i = 0; i != NumElts; ++i) @@ -34256,21 +34691,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SrcElts.setBit(M); } + // TODO - Propagate input undef/zero elts. APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } - // Extract known zero/undef elements. - // TODO - Propagate input undef/zero elts. - for (int i = 0; i != NumElts; ++i) { - if (OpMask[i] == SM_SentinelUndef) - KnownUndef.setBit(i); - if (OpMask[i] == SM_SentinelZero) - KnownZero.setBit(i); - } - return false; } @@ -34296,6 +34723,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp, TLO, Depth + 1)) return true; + + // Aggressively peek through ops to get at the demanded low bits. + SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( + LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( + RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1); + if (DemandedLHS || DemandedRHS) { + DemandedLHS = DemandedLHS ? DemandedLHS : LHS; + DemandedRHS = DemandedRHS ? DemandedRHS : RHS; + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS)); + } break; } case X86ISD::VSHLI: { @@ -34323,7 +34762,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned NewOpc = Diff < 0 ? 
X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), - TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); + TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } @@ -34441,6 +34880,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( KnownVec, TLO, Depth + 1)) return true; + if (SDValue V = SimplifyMultipleUseDemandedBits( + Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo( + Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); + Known = KnownVec.zext(BitWidth, true); return false; } @@ -34542,12 +34986,80 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const { + int NumElts = DemandedElts.getBitWidth(); + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + + switch (Opc) { + case X86ISD::PINSRB: + case X86ISD::PINSRW: { + // If we don't demand the inserted element, return the base vector. + SDValue Vec = Op.getOperand(0); + auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + MVT VecVT = Vec.getSimpleValueType(); + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && + !DemandedElts[CIdx->getZExtValue()]) + return Vec; + break; + } + } + + APInt ShuffleUndef, ShuffleZero; + SmallVector<int, 16> ShuffleMask; + SmallVector<SDValue, 2> ShuffleOps; + if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask, + ShuffleUndef, ShuffleZero, DAG, Depth, false)) { + // If all the demanded elts are from one operand and are inline, + // then we can use the operand directly. + int NumOps = ShuffleOps.size(); + if (ShuffleMask.size() == (unsigned)NumElts && + llvm::all_of(ShuffleOps, [VT](SDValue V) { + return VT.getSizeInBits() == V.getValueSizeInBits(); + })) { + + if (DemandedElts.isSubsetOf(ShuffleUndef)) + return DAG.getUNDEF(VT); + if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero)) + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op)); + + // Bitmask that indicates which ops have only been accessed 'inline'. + APInt IdentityOp = APInt::getAllOnesValue(NumOps); + for (int i = 0; i != NumElts; ++i) { + int M = ShuffleMask[i]; + if (!DemandedElts[i] || ShuffleUndef[i]) + continue; + int Op = M / NumElts; + int Index = M % NumElts; + if (M < 0 || Index != i) { + IdentityOp.clearAllBits(); + break; + } + IdentityOp &= APInt::getOneBitSet(NumOps, Op); + if (IdentityOp == 0) + break; + } + assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) && + "Multiple identity shuffles detected"); + + if (IdentityOp != 0) + return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]); + } + } + + return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( + Op, DemandedBits, DemandedElts, DAG, Depth); +} + /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. 
-static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -34559,13 +35071,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT OriginalVT = InVec.getValueType(); + unsigned NumOriginalElts = OriginalVT.getVectorNumElements(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); - if (!CurrentVT.isVector() || - CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) + if (!CurrentVT.isVector()) + return SDValue(); + + unsigned NumCurrentElts = CurrentVT.getVectorNumElements(); + if ((NumOriginalElts % NumCurrentElts) != 0) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) @@ -34582,10 +35098,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); + unsigned Scale = NumOriginalElts / NumCurrentElts; + if (Scale > 1) { + SmallVector<int, 16> ScaledMask; + scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask); + ShuffleMask = std::move(ScaledMask); + } + assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch"); + // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) @@ -34598,8 +35121,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) return SDValue(); - assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; + assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) && + "Shuffle index out of range"); + SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = @@ -34619,7 +35143,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); - if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and @@ -34637,10 +35161,11 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; - Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, - ShuffleMask); - Shuffle = DAG.getBitcast(OriginalVT, Shuffle); + SDValue Shuffle = UnaryShuffle ? 
DAG.getUNDEF(OriginalVT) + : DAG.getBitcast(OriginalVT, ShuffleOps[1]); + Shuffle = DAG.getVectorShuffle(OriginalVT, dl, + DAG.getBitcast(OriginalVT, ShuffleOps[0]), + Shuffle, ShuffleMask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -34660,6 +35185,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to push sign extension of vXi1 SETCC result through bitops. +static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, + SDValue Src, const SDLoc &DL) { + switch (Src.getOpcode()) { + case ISD::SETCC: + return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + case ISD::AND: + case ISD::XOR: + case ISD::OR: + return DAG.getNode( + Src.getOpcode(), DL, SExtVT, + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL), + signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL)); + } + llvm_unreachable("Unexpected node type for vXi1 sign extension"); +} + // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> @@ -34698,6 +35240,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; + bool PropagateSExt = false; switch (SrcVT.getSimpleVT().SimpleTy) { default: return SDValue(); @@ -34708,8 +35251,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) + if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) { SExtVT = MVT::v4i64; + PropagateSExt = true; + } break; case MVT::v8i1: SExtVT = MVT::v8i16; @@ -34718,11 +35263,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. - // TODO : use checkBitcastSrcVectorSize - if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (Src.getOperand(0).getValueType().is256BitVector() || - Src.getOperand(0).getValueType().is512BitVector())) { + if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) || + checkBitcastSrcVectorSize(Src, 512))) { SExtVT = MVT::v8i32; + PropagateSExt = true; } break; case MVT::v16i1: @@ -34745,19 +35289,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, return SDValue(); }; - SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); + SDValue V = PropagateSExt ? 
signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) + : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - if (SExtVT == MVT::v64i8) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(V, DL); - Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); - Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); - Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); - Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); - Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, - DAG.getConstant(32, DL, MVT::i8)); - V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); - } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) { + if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); } else { if (SExtVT == MVT::v8i16) @@ -34891,8 +35426,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, - DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat, - DAG.getConstant(ShufMask, DL, MVT::i8)); + DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), + Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { @@ -34935,6 +35470,24 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; + // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && + VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && + N0.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && + cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) { + SDValue N00 = N0.getOperand(0); + // Only do this if we can avoid scalarizing the input. + if (ISD::isNormalLoad(N00.getNode()) || + (N00.getOpcode() == ISD::BITCAST && + N00.getOperand(0).getValueType() == MVT::v4f32)) { + SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, + DAG.getBitcast(MVT::v4f32, N00)); + return DAG.getZExtOrTrunc(V, dl, VT); + } + } + // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -34949,6 +35502,26 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { + // Use zeros for the widening if we already have some zeroes. This can + // allow SimplifyDemandedBits to remove scalar ANDs that may be down + // stream of this. + // FIXME: It might make sense to detect a concat_vectors with a mix of + // zeroes and undef and turn it into insert_subvector for i1 vectors as + // a separate combine. What we can't do is canonicalize the operands of + // such a concat or we'll get into a loop with SimplifyDemandedBits. 
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
+ if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
+ SrcVT = LastOp.getValueType();
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
+ Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ }
+
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -34958,6 +35531,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
+ // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+ // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+ // due to insert_subvector legalization on KNL. By promoting the copy to i16
+ // we can help with known bits propagation from the vXi1 domain to the
+ // scalar domain.
+ if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+ !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == MVT::v16i1 &&
+ isNullConstant(N0.getOperand(1)))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+ DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
+ // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
+ // determines the number of bits loaded. Remaining bits are zero.
+ if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
+ VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ auto *BCast = cast<MemIntrinsicSDNode>(N0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ VT.getVectorElementType(),
+ BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return ResNode;
+ }
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
@@ -35152,7 +35752,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
- Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
return SDValue();
@@ -35246,29 +35846,31 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
+ unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
- if (NumElts > 32)
+ if (NumElts > 64 || !isPowerOf2_32(NumElts))
return SDValue();
- if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ if (TLI.isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
- if (NumElts == 32 && !Subtarget.hasInt256()) { + while (NumElts > MaxElts) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); - NumElts = 16; + NumElts /= 2; } EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); } if (!Movmsk) return SDValue(); - Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32); + Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { // Bail with AVX512VL (which uses predicate registers). if (Subtarget.hasVLX()) @@ -35309,13 +35911,15 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); NumElts = MaskSrcVT.getVectorNumElements(); } - assert(NumElts <= 32 && "Not expecting more than 32 elements"); + assert((NumElts <= 32 || NumElts == 64) && + "Not expecting more than 64 elements"); + MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32; if (BinOp == ISD::XOR) { // parity -> (AND (CTPOP(MOVMSK X)), 1) - SDValue Mask = DAG.getConstant(1, DL, MVT::i32); - SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk); - Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask); + SDValue Mask = DAG.getConstant(1, DL, CmpVT); + SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk); + Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask); return DAG.getZExtOrTrunc(Result, DL, ExtractVT); } @@ -35323,19 +35927,19 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 - CmpC = DAG.getConstant(0, DL, MVT::i32); + CmpC = DAG.getConstant(0, DL, CmpVT); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) - CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32); + CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts), + DL, CmpVT); CondCode = ISD::CondCode::SETEQ; } // The setcc produces an i8 of 0/1, so extend that to the result width and // negate to get the final 0/-1 mask value. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetccVT = - TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT); SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); SDValue Zero = DAG.getConstant(0, DL, ExtractVT); @@ -35431,6 +36035,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDLoc dl(N); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); @@ -35452,10 +36057,37 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, SrcOp); } + // If we're extracting a single element from a broadcast load and there are + // no other users, just create a single load. 
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC); + unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); + if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && + VT.getSizeInBits() == SrcBCWidth) { + SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getPointerInfo(), + MemIntr->getAlignment(), + MemIntr->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Load; + } + } + + // Handle extract(truncate(x)) for 0'th index. + // TODO: Treat this as a faux shuffle? + // TODO: When can we use this for general indices? + if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && + isNullConstant(Idx)) { + Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); + Src = DAG.getBitcast(SrcVT, Src); + return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); + } + // Resolve the target shuffle inputs and mask. SmallVector<int, 16> Mask; SmallVector<SDValue, 2> Ops; - if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) + if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. @@ -35489,7 +36121,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; - SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) @@ -35584,7 +36215,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { } // TODO: This switch could include FNEG and the x86-specific FP logic ops - // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid + // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid // missed load folding and fma+fneg combining. switch (Vec.getOpcode()) { case ISD::FMA: // Begin 3 operands @@ -35631,27 +36262,84 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); - if (!Subtarget.hasFastHorizontalOps() && !OptForSize) - return SDValue(); - SDValue Index = ExtElt->getOperand(1); - if (!isNullConstant(Index)) - return SDValue(); - // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros. ISD::NodeType Opc; - SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD}); + SDValue Rdx = + DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true); if (!Rdx) return SDValue(); + SDValue Index = ExtElt->getOperand(1); + assert(isNullConstant(Index) && + "Reduction doesn't end in an extract from index 0"); + EVT VT = ExtElt->getValueType(0); - EVT VecVT = ExtElt->getOperand(0).getValueType(); + EVT VecVT = Rdx.getValueType(); if (VecVT.getScalarType() != VT) return SDValue(); - unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; SDLoc DL(ExtElt); + // vXi8 reduction - sub 128-bit vector. + if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { + if (VecVT == MVT::v4i8) { + // Pad with zero. 
+ if (Subtarget.hasSSE41()) {
+ Rdx = DAG.getBitcast(MVT::i32, Rdx);
+ Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32), Rdx,
+ DAG.getIntPtrConstant(0, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ } else {
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getConstant(0, DL, VecVT));
+ }
+ }
+ if (Rdx.getValueType() == MVT::v8i8) {
+ // Pad with undef.
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ }
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ DAG.getConstant(0, DL, MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Must be a >=128-bit vector with pow2 elements.
+ if ((VecVT.getSizeInBits() % 128) != 0 ||
+ !isPowerOf2_32(VecVT.getVectorNumElements()))
+ return SDValue();
+
+ // vXi8 reduction - sum lo/hi halves then use PSADBW.
+ if (VT == MVT::i8) {
+ while (Rdx.getValueSizeInBits() > 128) {
+ unsigned HalfSize = VecVT.getSizeInBits() / 2;
+ unsigned HalfElts = VecVT.getVectorNumElements() / 2;
+ SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
+ SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
+ Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
+ VecVT = Rdx.getValueType();
+ }
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
+
+ SDValue Hi = DAG.getVectorShuffle(
+ MVT::v16i8, DL, Rdx, Rdx,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
@@ -35661,15 +36349,14 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); - VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2); - Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo); + Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo); + VecVT = Rdx.getValueType(); } if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) return SDValue(); // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 - assert(Rdx.getValueType() == VecVT && "Unexpected reduction match"); unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); for (unsigned i = 0; i != ReductionSteps; ++i) Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); @@ -35714,15 +36401,26 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, } } - // TODO - Remove this once we can handle the implicit zero-extension of - // X86ISD::PEXTRW/X86ISD::PEXTRB in: - // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and - // combineBasicSADPattern. if (IsPextr) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits( SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI)) return SDValue(N, 0); + + // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling). + if ((InputVector.getOpcode() == X86ISD::PINSRB || + InputVector.getOpcode() == X86ISD::PINSRW) && + InputVector.getOperand(2) == EltIdx) { + assert(SrcVT == InputVector.getOperand(0).getValueType() && + "Vector type mismatch"); + SDValue Scl = InputVector.getOperand(1); + Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl); + return DAG.getZExtOrTrunc(Scl, dl, VT); + } + + // TODO - Remove this once we can handle the implicit zero-extension of + // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad, + // combineHorizontalPredicateResult and combineBasicSADPattern. return SDValue(); } @@ -35832,6 +36530,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // get simplified at node creation time)? bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + // If both inputs are 0/undef, create a complete zero vector. + // FIXME: As noted above this should be handled by DAGCombiner/getNode. + if (TValIsAllZeros && FValIsAllZeros) { + if (VT.isFloatingPoint()) + return DAG.getConstantFP(0.0, DL, VT); + return DAG.getConstant(0, DL, VT); + } + if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) @@ -36295,8 +37002,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // Since SKX these selects have a proper lowering. 
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); @@ -36358,6 +37063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && + Cond.hasOneUse() && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -36508,6 +37214,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) return V; + // select(~Cond, X, Y) -> select(Cond, Y, X) + if (CondVT.getScalarType() != MVT::i1) + if (SDValue CondNot = IsNOT(Cond, DAG)) + return DAG.getNode(N->getOpcode(), DL, VT, + DAG.getBitcast(CondVT, CondNot), RHS, LHS); + // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); @@ -36873,8 +37585,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { - SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), - Flags}; + SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), + Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -36923,12 +37635,13 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { - uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); - if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); + assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && + "Implicit constant truncation"); bool isFastMultiplier = false; - if (Diff < 10) { - switch ((unsigned char)Diff) { + if (Diff.ult(10)) { + switch (Diff.getZExtValue()) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) @@ -36943,7 +37656,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, } if (isFastMultiplier) { - APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. 
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), @@ -36994,8 +37706,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { - SDValue Ops[] = { FalseOp, Cond.getOperand(0), - DAG.getConstant(CC, DL, MVT::i8), Cond }; + SDValue Ops[] = {FalseOp, Cond.getOperand(0), + DAG.getTargetConstant(CC, DL, MVT::i8), Cond}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -37029,10 +37741,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, CC1 = X86::GetOppositeBranchCondition(CC1); } - SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), - Flags}; + SDValue LOps[] = {FalseOp, TrueOp, + DAG.getTargetConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); - SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; + SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8), + Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } @@ -37064,9 +37777,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // This should constant fold. SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); - SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), - DAG.getConstant(X86::COND_NE, DL, MVT::i8), - Cond); + SDValue CMov = + DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), + DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond); return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } @@ -37166,98 +37879,45 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, if ((NumElts % 2) != 0) return SDValue(); - unsigned RegSize = 128; - MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - if (ExperimentalVectorWideningLegalization || - NumElts >= OpsVT.getVectorNumElements()) { - // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the - // lower part is needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); - if (Mode == MULU8 || Mode == MULS8) - return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, - DL, VT, MulLo); - - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); - // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, - // the higher part is also needed. - SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - ReducedVT, NewN0, NewN1); - - // Repack the lower part and higher part result of mul into a wider - // result. - // Generate shuffle functioning as punpcklwd. - SmallVector<int, 16> ShuffleMask(NumElts); - for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + NumElts; - } - SDValue ResLo = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getBitcast(ResVT, ResLo); - // Generate shuffle functioning as punpckhwd. 
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) { - ShuffleMask[2 * i] = i + NumElts / 2; - ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; - } - SDValue ResHi = - DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getBitcast(ResVT, ResHi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); - } - - // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want - // to legalize the mul explicitly because implicit legalization for type - // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack - // instructions which will not exist when we explicitly legalize it by - // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with - // <4 x i16> undef). - // - // Legalize the operands of mul. - // FIXME: We may be able to handle non-concatenated vectors by insertion. - unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); - if ((RegSize % ReducedSizeInBits) != 0) - return SDValue(); - - SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, - DAG.getUNDEF(ReducedVT)); - Ops[0] = NewN0; - NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - Ops[0] = NewN1; - NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); - - if (Mode == MULU8 || Mode == MULS8) { - // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower - // part is needed. - SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); - - // convert the type of mul result to VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG, - DL, ResVT, Mul); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); - } + // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the + // lower part is needed. + SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); + if (Mode == MULU8 || Mode == MULS8) + return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, + DL, VT, MulLo); - // Generate the lower and higher part of mul: pmulhw/pmulhuw. For - // MULU16/MULS16, both parts are needed. - SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, + // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, - OpsVT, NewN0, NewN1); + ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider - // result. Make sure the type of mul result is VT. - MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); - Res = DAG.getBitcast(ResVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + // result. + // Generate shuffle functioning as punpcklwd. + SmallVector<int, 16> ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i; + ShuffleMask[2 * i + 1] = i + NumElts; + } + SDValue ResLo = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResLo = DAG.getBitcast(ResVT, ResLo); + // Generate shuffle functioning as punpckhwd. 
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; + } + SDValue ResHi = + DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); + ResHi = DAG.getBitcast(ResVT, ResHi); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, @@ -37365,8 +38025,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || - DAG.getTargetLoweringInfo().isTypeLegal(WVT))) + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) return SDValue(); SDValue N0 = N->getOperand(0); @@ -37919,7 +38578,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (NewShiftVal >= NumBitsPerElt) NewShiftVal = NumBitsPerElt - 1; return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), - DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); + DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. @@ -38039,7 +38698,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, - DAG.getConstant(x86cc, DL, MVT::i8)); + DAG.getTargetConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, @@ -38048,10 +38707,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } - SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, - CMP00.getValueType(), CMP00, CMP01, - DAG.getConstant(x86cc, DL, - MVT::i8)); + SDValue OnesOrZeroesF = + DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, + CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; @@ -38083,34 +38741,6 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// Match (xor X, -1) -> X. -// Match extract_subvector(xor X, -1) -> extract_subvector(X). -// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). 
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { - V = peekThroughBitcasts(V); - if (V.getOpcode() == ISD::XOR && - ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) - return V.getOperand(0); - if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && - (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { - if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { - Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), - Not, V.getOperand(1)); - } - } - SmallVector<SDValue, 2> CatOps; - if (collectConcatOps(V.getNode(), CatOps)) { - for (SDValue &CatOp : CatOps) { - SDValue NotCat = IsNOT(CatOp, DAG); - if (!NotCat) return SDValue(); - CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); - } - return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); - } - return SDValue(); -} - /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); @@ -38273,7 +38903,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); - SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); + SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } @@ -38499,7 +39129,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector<SDValue, 2> SrcOps; - if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) && + if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && SrcOps.size() == 1) { SDLoc dl(N); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); @@ -38570,7 +39200,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, } if (SDValue Shuffle = combineX86ShufflesRecursively( - {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, + {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); @@ -38585,7 +39215,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); - EVT VT = N->getValueType(0); + MVT VT = N->getSimpleValueType(0); if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0) return SDValue(); @@ -38594,10 +39224,12 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) return SDValue(); - // On XOP we'll lower to PCMOV so accept one use, otherwise only - // do this if either mask has multiple uses already. - if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() || - !N1.getOperand(1).hasOneUse())) + // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use + // VPTERNLOG. Otherwise only do this if either mask has multiple uses already. + bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) || + Subtarget.hasVLX(); + if (!(Subtarget.hasXOP() || UseVPTERNLOG || + !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) return SDValue(); // Attempt to extract constant byte masks. 
@@ -38895,6 +39527,24 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, DAG.getBitcast(MVT::v4f32, N1))); } + // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. + // TODO: Support multiple SrcOps. + if (VT == MVT::i1) { + SmallVector<SDValue, 2> SrcOps; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SrcOps.size() == 1) { + SDLoc dl(N); + unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); + EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); + SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); + if (Mask) { + APInt AllBits = APInt::getNullValue(NumElts); + return DAG.getSetCC(dl, MVT::i1, Mask, + DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + } + } + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -39136,26 +39786,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } -/// Check if truncation with saturation form type \p SrcVT to \p DstVT -/// is valid for the given \p Subtarget. -static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasAVX512()) - return false; - - // FIXME: Scalar type may be supported if we move it to vector register. - if (!SrcVT.isVector()) - return false; - - EVT SrcElVT = SrcVT.getScalarType(); - EVT DstElVT = DstVT.getScalarType(); - if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) - return false; - if (SrcVT.is512BitVector() || Subtarget.hasVLX()) - return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); - return false; -} - /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). @@ -39253,64 +39883,61 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { return SDValue(); } -/// Detect a pattern of truncation with signed saturation. -/// The types should allow to use VPMOVSS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. -static SDValue detectAVX512SSatPattern(SDValue In, EVT VT, - const X86Subtarget &Subtarget, - const TargetLowering &TLI) { - if (!TLI.isTypeLegal(In.getValueType())) - return SDValue(); - if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) - return SDValue(); - return detectSSatPattern(In, VT); -} - -/// Detect a pattern of truncation with saturation: -/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). -/// The types should allow to use VPMOVUS* instruction on AVX512. -/// Return the source value to be truncated or SDValue() if the pattern was not -/// matched. 
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const SDLoc &DL,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (!TLI.isTypeLegal(In.getValueType()))
- return SDValue();
- if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
- return SDValue();
- return detectUSatPattern(In, VT, DAG, DL);
-}
-
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT SVT = VT.getScalarType();
+ if (!Subtarget.hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getScalarType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
- isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
- if (auto SSatVal = detectSSatPattern(In, VT))
- return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
- if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
- }
- if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
- !Subtarget.hasAVX512() &&
+ EVT InSVT = InVT.getVectorElementType();
+
+ // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+ // split across two registers, we can use a packusdw+perm to clamp to 0-65535
+ // and concatenate at the same time. Then we can use a final vpmovuswb to
+ // clip to 0-255.
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+ DL, DAG, Subtarget);
+ assert(Mid && "Failed to pack!");
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+ }
+ }
+
+ // vXi32 truncate instructions are available with AVX512F.
+ // vXi16 truncate instructions are only available with AVX512BW.
+ // For 256-bit or smaller vectors, we require VLX.
+ // FIXME: We could widen truncates to 512 to remove the VLX restriction.
+ // If the result type is 256-bits or larger and we have disabled 512-bit
+ // registers, we should go ahead and use the pack instructions if possible.
+ bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+ (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+ (InVT.getSizeInBits() > 128) &&
+ (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+ !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+
+ if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
+ VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+ // Only do this when the result is at least 64 bits or we'd be leaving
+ // dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); - if (Mid) - return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, - Subtarget); + assert(Mid && "Failed to pack!"); + SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, + Subtarget); + assert(V && "Failed to pack!"); + return V; } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); @@ -39319,6 +39946,42 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && + Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) { + unsigned TruncOpc; + SDValue SatVal; + if (auto SSatVal = detectSSatPattern(In, VT)) { + SatVal = SSatVal; + TruncOpc = X86ISD::VTRUNCS; + } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) { + SatVal = USatVal; + TruncOpc = X86ISD::VTRUNCUS; + } + if (SatVal) { + unsigned ResElts = VT.getVectorNumElements(); + // If the input type is less than 512 bits and we don't have VLX, we need + // to widen to 512 bits. + if (!Subtarget.hasVLX() && !InVT.is512BitVector()) { + unsigned NumConcats = 512 / InVT.getSizeInBits(); + ResElts *= NumConcats; + SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT)); + ConcatOps[0] = SatVal; + InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, + NumConcats * InVT.getVectorNumElements()); + SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps); + } + // Widen the result if its narrower than 128 bits. + if (ResElts * SVT.getSizeInBits() < 128) + ResElts = 128 / SVT.getSizeInBits(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); + SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + } + } + return SDValue(); } @@ -39377,7 +40040,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return true; }; - // Check if each element of the vector is left-shifted by one. + // Check if each element of the vector is right-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) @@ -39679,90 +40342,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::EXTLOAD) - return SDValue(); - - // Resolve extending loads. - EVT VT = Mld->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - EVT LdVT = Mld->getMemoryVT(); - SDLoc dl(Mld); - - assert(LdVT != VT && "Cannot extend to the same type"); - unsigned ToSz = VT.getScalarSizeInBits(); - unsigned FromSz = LdVT.getScalarSizeInBits(); - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for extending masked load"); - - unsigned SizeRatio = ToSz / FromSz; - assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - LdVT.getScalarType(), NumElems*SizeRatio); - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - // Convert PassThru value. 
- SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); - if (!Mld->getPassThru().isUndef()) { - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, - DAG.getUNDEF(WideVecVT), ShuffleVec); - } - - // Prepare the new mask. - SDValue NewMask; - SDValue Mask = Mld->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. - NewMask = DAG.getBitcast(WideVecVT, Mask); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) - ShuffleVec[i] = NumElems * SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); - - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); - } - - SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), - Mld->getBasePtr(), NewMask, WidePassThru, - Mld->getMemoryVT(), Mld->getMemOperand(), - ISD::NON_EXTLOAD); - - SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio] = i; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), ShuffleVec); - SlicedVec = DAG.getBitcast(VT, SlicedVec); - - return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); + return SDValue(); } /// If exactly one element of the mask is set for a non-truncating masked store, @@ -39800,123 +40380,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); - EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!Mst->isTruncatingStore()) { - if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) - return ScalarStore; - - // If the mask value has been legalized to a non-boolean vector, try to - // simplify ops leading up to it. We only demand the MSB of each lane. - SDValue Mask = Mst->getMask(); - if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) - return SDValue(N, 0); - } - - // TODO: AVX512 targets should also be able to simplify something like the - // pattern above, but that pattern will be different. It will either need to - // match setcc more generally or match PCMPGTM later (in tablegen?). 
- - SDValue Value = Mst->getValue(); - if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && - TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), - Mst->getMemoryVT())) { - return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); - } - + if (Mst->isTruncatingStore()) return SDValue(); - } - - // Resolve truncating stores. - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegal(VT, StVT)) - return SDValue(); + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; - // From/To sizes and ElemCount must be pow of two. - assert (isPowerOf2_32(NumElems * FromSz * ToSz) && - "Unexpected size for truncating masked store"); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - assert (((NumElems * FromSz) % ToSz) == 0 && - "Unexpected ratio for truncating masked store"); - - unsigned SizeRatio = FromSz / ToSz; - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle. - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - - // Can't shuffle using an illegal type. - assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && - "WideVecVT should be legal"); - - SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - - SDValue NewMask; + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); - if (Mask.getValueType() == VT) { - // Mask and original value have the same type. 
- NewMask = DAG.getBitcast(WideVecVT, Mask); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; - NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, - DAG.getConstant(0, dl, WideVecVT), - ShuffleVec); - } else { - assert(Mask.getValueType().getVectorElementType() == MVT::i1); - unsigned WidenNumElts = NumElems*SizeRatio; - unsigned MaskNumElts = VT.getVectorNumElements(); - EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - WidenNumElts); + if (Mask.getScalarValueSizeInBits() != 1) { + APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } - unsigned NumConcat = WidenNumElts / MaskNumElts; - SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); - SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); - Ops[0] = Mask; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, - Mst->getBasePtr(), NewMask, StVT, - Mst->getMemOperand(), false); + return SDValue(); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { StoreSDNode *St = cast<StoreSDNode>(N); - EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); unsigned Alignment = St->getAlignment(); - SDValue StoredVal = St->getOperand(1); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert a store of vXi1 into a store of iX and a bitcast. @@ -39986,8 +40488,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, St->getMemOperand()->getFlags()); } - // If we are saving a concatenation of two XMM registers and 32-byte stores - // are slow, such as on Sandy Bridge, perform two 16-byte stores. + // If we are saving a 32-byte vector and 32-byte stores are slow, such as on + // Sandy Bridge, perform two 16-byte stores. bool Fast; if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, @@ -40026,13 +40528,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && St->getValue().getOpcode() == ISD::TRUNCATE && St->getValue().getOperand(0).getValueType() == MVT::v16i16 && - TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) && - !DCI.isBeforeLegalizeOps()) { + TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && + St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue()); return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), MVT::v16i8, St->getMemOperand()); } + // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. 
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() && + (StoredVal.getOpcode() == X86ISD::VTRUNCUS || + StoredVal.getOpcode() == X86ISD::VTRUNCS) && + TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { + bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; + return EmitTruncSStore(IsSigned, St->getChain(), + dl, StoredVal.getOperand(0), St->getBasePtr(), + VT, St->getMemOperand(), DAG); + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. @@ -40040,100 +40553,26 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. - if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, - Subtarget, dl)) - return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); - - if (SDValue Val = - detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, - TLI)) - return EmitTruncSStore(true /* Signed saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), - DAG, dl, Subtarget, TLI)) - return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), - dl, Val, St->getBasePtr(), - St->getMemoryVT(), St->getMemOperand(), DAG); - - unsigned NumElems = VT.getVectorNumElements(); - assert(StVT != VT && "Cannot truncate to the same type"); - unsigned FromSz = VT.getScalarSizeInBits(); - unsigned ToSz = StVT.getScalarSizeInBits(); - - // The truncating store is legal in some cases. For example - // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw - // are designated for truncate store. - // In this case we don't need any further transformations. - if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) - return SDValue(); - - // From, To sizes and ElemCount must be pow of two - if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); - // We are going to use the original vector elt for storing. - // Accumulated smaller vector elements must be a multiple of the store size. - if (0 != (NumElems * FromSz) % ToSz) return SDValue(); - - unsigned SizeRatio = FromSz / ToSz; - - assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - - // Create a type on which we perform the shuffle - EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), - StVT.getScalarType(), NumElems*SizeRatio); - - assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - - SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i] = i * SizeRatio; + if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT())) + if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, + Subtarget, dl)) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); - // Can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); - - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - ShuffleVec); - // At this point all of the data is stored at the bottom of the - // register. We now need to save it to mem. 
- - // Find the largest store unit - MVT StoreType = MVT::i8; - for (MVT Tp : MVT::integer_valuetypes()) { - if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) - StoreType = Tp; - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && - (64 <= NumElems * ToSz)) - StoreType = MVT::f64; - - // Bitcast the original vector into a vector of store-size units - EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), - StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); - assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); - SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); - SmallVector<SDValue, 8> Chains; - SDValue Ptr = St->getBasePtr(); - - // Perform one or more big stores into memory. - for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { - SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - StoreType, ShuffWide, - DAG.getIntPtrConstant(i, dl)); - SDValue Ch = - DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); - Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); - Chains.push_back(Ch); + if (TLI.isTruncStoreLegal(VT, StVT)) { + if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) + return EmitTruncSStore(true /* Signed saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); + if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(), + DAG, dl)) + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, Val, St->getBasePtr(), + St->getMemoryVT(), St->getMemOperand(), DAG); } - return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + return SDValue(); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering @@ -40149,11 +40588,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); - if (((VT.isVector() && !VT.isFloatingPoint()) || - (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && + if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && isa<LoadSDNode>(St->getValue()) && - !cast<LoadSDNode>(St->getValue())->isVolatile() && - St->getChain().hasOneUse() && !St->isVolatile()) { + cast<LoadSDNode>(St->getValue())->isSimple() && + St->getChain().hasOneUse() && St->isSimple()) { LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); SmallVector<SDValue, 8> Ops; @@ -40595,8 +41033,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // Requires SSE2 but AVX512 has fast truncate. - if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + // Requires SSE2. + if (!Subtarget.hasSSE2()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) @@ -40620,6 +41058,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, + // there's no harm in trying pack. 
+ if (Subtarget.hasAVX512() && + !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && + InVT.is512BitVector())) + return SDValue(); + unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; @@ -40658,9 +41103,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Only handle vXi16 types that are at least 128-bits unless they will be // widened. - if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || - (!ExperimentalVectorWideningLegalization && - VT.getVectorNumElements() < 8)) + if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); // Input type should be vXi32. @@ -40874,6 +41317,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue In = N->getOperand(0); + SDLoc DL(N); + + if (auto SSatVal = detectSSatPattern(In, VT)) + return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); + if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + + return SDValue(); +} + /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) @@ -40883,10 +41339,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, /// In this case we go though all bitcasts. /// This also recognizes splat of a negated value and returns the splat of that /// value. -static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { +static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return SDValue(); + unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); @@ -40900,7 +41360,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); - if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) + if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1)) if (NegOp0.getValueType() == VT) // FIXME: Can we do better? 
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), SVOp->getMask()); @@ -40914,7 +41374,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); - if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) + if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1)) if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); @@ -40951,6 +41411,57 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { return SDValue(); } +static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, + bool NegRes) { + if (NegMul) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = ISD::FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + } + } + + if (NegAcc) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + } + } + + if (NegRes) { + switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); + case ISD::FMA: Opcode = X86ISD::FNMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = ISD::FMA; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + } + } + + return Opcode; +} + /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -40980,29 +41491,123 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. 
- unsigned NewOpcode = 0; if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { - case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; - case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; - case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; - case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; - case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; - // We can't handle scalar intrinsic node here because it would only - // invert one element and not the whole vector. But we could try to handle - // a negation of the lower element only. - } - } - if (NewOpcode) - return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, - Arg.getNode()->ops())); + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + // We can't handle scalar intrinsic node here because it would only + // invert one element and not the whole vector. But we could try to handle + // a negation of the lower element only. + unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true); + return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops())); + } + } + } return SDValue(); } +char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (isFNEG(DAG, Op.getNode(), Depth)) + return 2; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + switch (Op.getOpcode()) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + return V; + } + return 1; + } + } + + return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + +SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) + return DAG.getBitcast(Op.getValueType(), Arg); + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. 
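The negateFMAOpcode table above encodes the usual x86 FMA sign algebra. A minimal standalone numeric check of those identities (illustrative only, assuming the standard semantics FMADD = a*b+c, FMSUB = a*b-c, FNMADD = -(a*b)+c, FNMSUB = -(a*b)-c):

#include <cassert>
#include <cmath>

static double fmadd(double a, double b, double c)  { return std::fma(a, b, c); }
static double fmsub(double a, double b, double c)  { return std::fma(a, b, -c); }
static double fnmadd(double a, double b, double c) { return std::fma(-a, b, c); }
static double fnmsub(double a, double b, double c) { return std::fma(-a, b, -c); }

int main() {
  // Exactly representable inputs, so the comparisons are exact.
  double a = 1.5, b = -2.25, c = 0.75;
  assert(-fmadd(a, b, c) == fnmsub(a, b, c)); // NegRes: FMA   -> FNMSUB
  assert(fmadd(-a, b, c) == fnmadd(a, b, c)); // NegMul: FMA   -> FNMADD
  assert(fmadd(a, b, -c) == fmsub(a, b, c));  // NegAcc: FMA   -> FMSUB
  assert(-fmsub(a, b, c) == fnmadd(a, b, c)); // NegRes: FMSUB -> FNMADD
  return 0;
}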
+ SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue()); + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + bool NegA = !!NewOps[0]; + bool NegB = !!NewOps[1]; + bool NegC = !!NewOps[2]; + unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + + // Fill in the non-negated ops with the original values. + for (int i = 0, e = Op.getNumOperands(); i != e; ++i) + if (!NewOps[i]) + NewOps[i] = Op.getOperand(i); + return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); + } + } + + return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); @@ -41312,8 +41917,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getIntegerVT(NumBits); @@ -41347,8 +41952,8 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile. - if (!LN->isVolatile()) { + // Unless the load is volatile or atomic. + if (LN->isSimple()) { SDLoc dl(N); unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); MVT MemVT = MVT::getFloatingPointVT(NumBits); @@ -41724,127 +42329,6 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } -/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or -/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating -/// with UNDEFs) of the input to vectors of the same size as the target type -/// which then extends the lowest elements. -static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - unsigned Opcode = N->getOpcode(); - // TODO - add ANY_EXTEND support. - if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) - return SDValue(); - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - EVT InVT = N0.getValueType(); - EVT InSVT = InVT.getScalarType(); - - // FIXME: Generic DAGCombiner previously had a bug that would cause a - // sign_extend of setcc to sometimes return the original node and tricked it - // into thinking CombineTo was used which prevented the target combines from - // running. 
- // Earlying out here to avoid regressions like this - // (v4i32 (sext (v4i1 (setcc (v4i16))))) - // Becomes - // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) - // Type legalized to - // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) - // Leading to a packssdw+pmovsxwd - // We could write a DAG combine to fix this, but really we shouldn't be - // creating sext_invec that's forcing v8i16 into the DAG. - if (N0.getOpcode() == ISD::SETCC) - return SDValue(); - - // Input type must be a vector and we must be extending legal integer types. - if (!VT.isVector() || VT.getVectorNumElements() < 2) - return SDValue(); - if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) - return SDValue(); - if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) - return SDValue(); - - // If the input/output types are both legal then we have at least AVX1 and - // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. - if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && - DAG.getTargetLoweringInfo().isTypeLegal(InVT)) - return SDValue(); - - SDLoc DL(N); - - auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { - EVT SrcVT = N.getValueType(); - EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(), - Size / SrcVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(), - DAG.getUNDEF(SrcVT)); - Opnds[0] = N; - return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds); - }; - - // If target-size is less than 128-bits, extend to a type that would extend - // to 128 bits, extend that and extract the original target vector. - if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { - unsigned Scale = 128 / VT.getSizeInBits(); - EVT ExVT = - EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); - SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); - SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, - DAG.getIntPtrConstant(0, DL)); - } - - // If target-size is 128-bits (or 256-bits on AVX target), then convert to - // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. - // Also use this if we don't have SSE41 to allow the legalizer do its job. - if (!Subtarget.hasSSE41() || VT.is128BitVector() || - (VT.is256BitVector() && Subtarget.hasAVX()) || - (VT.is512BitVector() && Subtarget.useAVX512Regs())) { - SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); - Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode); - return DAG.getNode(Opcode, DL, VT, ExOp); - } - - auto SplitAndExtendInReg = [&](unsigned SplitSize) { - unsigned NumVecs = VT.getSizeInBits() / SplitSize; - unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); - EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); - EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); - - unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode); - SmallVector<SDValue, 8> Opnds; - for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { - SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, - DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); - SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); - Opnds.push_back(SrcVec); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); - }; - - // On pre-AVX targets, split into 128-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. 
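The combineToExtendVectorInReg path being deleted here built *_EXTEND_VECTOR_INREG nodes, which produce fewer, wider elements by extending only the lowest elements of the input. A rough scalar sketch of that operation (not from the patch, names are ad hoc), for the v8i16 to v4i32 case that pmovsxwd handles on the low half:

#include <array>
#include <cassert>
#include <cstdint>

std::array<int32_t, 4> signExtendVectorInReg(const std::array<int16_t, 8> &In) {
  std::array<int32_t, 4> Out{};
  for (int i = 0; i != 4; ++i)
    Out[i] = In[i]; // sign extend the i-th lowest input element
  return Out;
}

int main() {
  std::array<int16_t, 8> In = {-1, 2, -3, 4, 100, 200, 300, 400};
  auto Out = signExtendVectorInReg(In);
  assert(Out[0] == -1 && Out[3] == 4); // the upper half of In is ignored
  return 0;
}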
- if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) - return SplitAndExtendInReg(128); - - // On pre-AVX512 targets, split into 256-bit nodes of - // ISD::*_EXTEND_VECTOR_INREG. - if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) - return SplitAndExtendInReg(256); - - return SDValue(); -} - // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, @@ -41915,9 +42399,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -41931,45 +42412,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { - if (NegMul) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; - } - } - - if (NegAcc) { - switch (Opcode) { - default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - } - } - - return Opcode; -} - static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); @@ -41980,17 +42431,21 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - auto invertIfNegative = [&DAG](SDValue &V) { - if (SDValue NegVal = isFNEG(DAG, V.getNode())) { - V = DAG.getBitcast(V.getValueType(), NegVal); + auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { + V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. 
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { - if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { - NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); + SDValue Vec = V.getOperand(0); + if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { + SDValue NegVal = + TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; @@ -42009,7 +42464,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (!NegA && !NegB && !NegC) return SDValue(); - unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); + unsigned NewOpcode = + negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); @@ -42017,33 +42473,27 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) +// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc dl(N); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); - SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); - if (!NegVal) + SDValue N2 = N->getOperand(2); + if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2) return SDValue(); - // FIXME: Should we bitcast instead? - if (NegVal.getValueType() != VT) - return SDValue(); - - unsigned NewOpcode; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected opcode!"); - case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; - case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; - } + SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); + unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal, N->getOperand(3)); + NegN2, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal); + NegN2); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -42090,9 +42540,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; - if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) - return V; - if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; @@ -42111,12 +42558,11 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - unsigned NumSrcElts = N00.getValueType().getVectorNumElements(); unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { - return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128); + return concatSubVectors(N00, N01, DAG, 
dl); } } @@ -42159,16 +42605,30 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, !IsOrXorXorCCZero) return SDValue(); - // TODO: Use PXOR + PTEST for SSE4.1 or later? EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); + bool HasAVX = Subtarget.hasAVX(); + + // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512. + // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && Subtarget.hasAVX2()) || + (OpSize == 256 && HasAVX) || (OpSize == 512 && Subtarget.useAVX512Regs())) { - EVT VecVT = OpSize == 512 ? MVT::v16i32 : - OpSize == 256 ? MVT::v32i8 : - MVT::v16i8; - EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; + bool HasPT = Subtarget.hasSSE41(); + EVT VecVT = MVT::v16i8; + EVT CmpVT = MVT::v16i8; + if (OpSize == 256) + VecVT = CmpVT = MVT::v32i8; + if (OpSize == 512) { + if (Subtarget.hasBWI()) { + VecVT = MVT::v64i8; + CmpVT = MVT::v64i1; + } else { + VecVT = MVT::v16i32; + CmpVT = MVT::v16i1; + } + } + SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: @@ -42179,18 +42639,38 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); - SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); - SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); - Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + if (VecVT == CmpVT && HasPT) { + SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B); + SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D); + Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2); + } else { + SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); + SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); + Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); + } } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); - Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + if (VecVT == CmpVT && HasPT) { + Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); + } else { + Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); + } } // For 512-bits we want to emit a setcc that will lower to kortest. - if (OpSize == 512) - return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), - DAG.getConstant(0xFFFF, DL, MVT::i16), CC); + if (VecVT != CmpVT) { + EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16; + SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT); + return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC); + } + if (HasPT) { + SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, + Cmp); + SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); + X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + SDValue SetCC = getSETCC(X86CC, PT, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0)); + } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne @@ -42270,8 +42750,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, // go through type promotion to a 128-bit vector. 
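The combineVectorSizedSetCCEquality changes above switch the SSE4.1 path to XOR (plus OR for the paired form) feeding PTEST against itself. A standalone sketch, not from the patch, of why that reduction decides equality: the XOR of equal lanes is zero, and PTEST of a value against itself sets ZF exactly when the value is all zero.

#include <cassert>
#include <cstdint>

struct V128 { uint64_t Lo, Hi; };

static bool allZero(V128 V) { return (V.Lo | V.Hi) == 0; }       // the PTEST/ZF role
static V128 vxor(V128 A, V128 B) { return {A.Lo ^ B.Lo, A.Hi ^ B.Hi}; }
static V128 vor(V128 A, V128 B)  { return {A.Lo | B.Lo, A.Hi | B.Hi}; }

int main() {
  V128 A{1, 2}, B{1, 2}, C{3, 4}, D{3, 5};
  assert(allZero(vxor(A, B)));                   // A == B
  assert(!allZero(vor(vxor(A, B), vxor(C, D)))); // A == B but C != D, so not equal
  return 0;
}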
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && - (ExperimentalVectorWideningLegalization || - VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, @@ -42289,7 +42767,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); @@ -42310,7 +42789,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, // Look through int->fp bitcasts that don't change the element width. unsigned EltWidth = SrcVT.getScalarSizeInBits(); - if (Src.getOpcode() == ISD::BITCAST && + if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST && Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); @@ -42334,71 +42813,123 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // With vector masks we only demand the upper bit of the mask. + SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + return SDValue(N, 0); + } + + return SDValue(); +} + static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); + auto *GorS = cast<MaskedGatherScatterSDNode>(N); + SDValue Chain = GorS->getChain(); + SDValue Index = GorS->getIndex(); + SDValue Mask = GorS->getMask(); + SDValue Base = GorS->getBasePtr(); + SDValue Scale = GorS->getScale(); - if (DCI.isBeforeLegalizeOps()) { - SDValue Index = N->getOperand(4); - // Remove any sign extends from 32 or smaller to larger than 32. - // Only do this before LegalizeOps in case we need the sign extend for - // legalization. - if (Index.getOpcode() == ISD::SIGN_EXTEND) { - if (Index.getScalarValueSizeInBits() > 32 && - Index.getOperand(0).getScalarValueSizeInBits() <= 32) { - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); + if (DCI.isBeforeLegalize()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); + + // Shrink constant indices if they are larger than 32-bits. + // Only do this before legalize types since v2i64 could become v2i32. + // FIXME: We could check that the type is legal if we're after legalize + // types, but then we would need to construct test cases where that happens. + // FIXME: We could support more than just constant vectors, but we need to + // careful with costing. A truncate that can be optimized out would be fine. + // Otherwise we might only want to create a truncate if it avoids a split. 
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) { + if (BV->isConstant() && IndexWidth > 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); } - return SDValue(Res, 0); - } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); + } + } + + // Shrink any sign/zero extends from 32 or smaller to larger than 32 if + // there are sufficient sign bits. Only do this before legalize types to + // avoid creating illegal types in truncate. + if ((Index.getOpcode() == ISD::SIGN_EXTEND || + Index.getOpcode() == ISD::ZERO_EXTEND) && + IndexWidth > 32 && + Index.getOperand(0).getScalarValueSizeInBits() <= 32 && + DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { + unsigned NumElts = Index.getValueType().getVectorNumElements(); + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); } + } + + if (DCI.isBeforeLegalizeOps()) { + unsigned IndexWidth = Index.getScalarValueSizeInBits(); // Make sure the index is either i32 or i64 - unsigned ScalarSize = Index.getScalarValueSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) { - MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; + if (IndexWidth != 32 && IndexWidth != 64) { + MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index; - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) - DCI.AddToWorklist(N); - return SDValue(Res, 0); - } - - // Try to remove zero extends from 32->64 if we know the sign bit of - // the input is zero. 
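The index shrinking added above truncates a 64-bit gather/scatter index vector to i32 when ComputeNumSignBits shows more than IndexWidth - 32 redundant sign bits, i.e. when every index already round-trips through i32. A standalone scalar sketch of that test (not from the patch, numSignBits is an ad hoc stand-in for the DAG helper):

#include <cassert>
#include <cstdint>

// Leading bits equal to the sign bit, counting the sign bit itself.
static unsigned numSignBits(int64_t V) {
  uint64_t U = (uint64_t)V;
  uint64_t Sign = U >> 63;
  unsigned N = 1;
  for (int Bit = 62; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
    ++N;
  return N;
}

int main() {
  int64_t Small = -5, Big = (1LL << 40);
  assert(numSignBits(Small) > 64 - 32 &&
         (int64_t)(int32_t)Small == Small); // safe to index with i32
  assert(numSignBits(Big) <= 64 - 32 &&
         (int64_t)(int32_t)Big != Big);     // must keep the i64 index
  return 0;
}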
- if (Index.getOpcode() == ISD::ZERO_EXTEND && - Index.getScalarValueSizeInBits() == 64 && - Index.getOperand(0).getScalarValueSizeInBits() == 32) { - if (DAG.SignBitIsZero(Index.getOperand(0))) { - SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Index.getOperand(0); - SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); - if (Res == N) { - // The original sign extend has less users, add back to worklist in - // case it needs to be removed - DCI.AddToWorklist(Index.getNode()); - DCI.AddToWorklist(N); - } - return SDValue(Res, 0); - } - } - } - - // With AVX2 we only demand the upper bit of the mask. - if (!Subtarget.hasAVX512()) { + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Chain, Gather->getPassThru(), + Mask, Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Chain, Scatter->getValue(), + Mask, Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); + } + } + + // With vector masks we only demand the upper bit of the mask. + if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Mask = N->getOperand(2); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -42432,7 +42963,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { - SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); + SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } @@ -42549,6 +43080,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. @@ -42578,13 +43110,22 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { - EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); + EVT TruncVT = MVT::i32; if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); - return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); + } + // If we're after legalize and the type is v2i32 we need to shuffle and + // use CVTSI2P. 
+ assert(InVT == MVT::v2i64 && "Unexpected VT!"); + SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); + SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, + { 0, 2, -1, -1 }); + return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf); } } @@ -42604,7 +43145,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); - if (!Ld->isVolatile() && !VT.isVector() && + if (Ld->isSimple() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( @@ -42841,12 +43382,12 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); - SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, - DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, - MVT::i8), - N->getOperand(2)), - DAG.getConstant(1, DL, VT)); + SDValue Res1 = + DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } @@ -42906,7 +43447,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } @@ -42924,7 +43465,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } @@ -42984,7 +43525,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } @@ -42997,7 +43538,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } @@ -43025,9 +43566,6 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - EVT VT = N->getValueType(0); // If the vector size is less than 128, or greater than the supported RegSize, @@ -43035,14 +43573,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() || VT.getVectorNumElements() < 8) return SDValue(); - if (Op0.getOpcode() != ISD::MUL) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::MUL) - return SDValue(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); - ShrinkMode Mode; - if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16) - return SDValue(); + auto UsePMADDWD = 
[&](SDValue Op) { + ShrinkMode Mode; + return Op.getOpcode() == ISD::MUL && + canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 && + (!Subtarget.hasSSE41() || + (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + Op->isOnlyUserOf(Op.getOperand(1).getNode()))); + }; + + SDValue MulOp, OtherOp; + if (UsePMADDWD(Op0)) { + MulOp = Op0; + OtherOp = Op1; + } else if (UsePMADDWD(Op1)) { + MulOp = Op1; + OtherOp = Op0; + } else + return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, @@ -43050,34 +43601,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); + // Shrink the operands of mul. + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); + // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); }; - - auto BuildPMADDWD = [&](SDValue Mul) { - // Shrink the operands of mul. - SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1)); - - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, - DAG.getConstant(0, DL, MAddVT)); - }; - - Op0 = BuildPMADDWD(Op0); - - // It's possible that Op1 is also a mul we can reduce. - if (Op1.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) { - Op1 = BuildPMADDWD(Op1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); + SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, + PMADDWDBuilder); + // Fill the rest of the output with 0 + SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); + + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, @@ -43087,8 +43631,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); EVT VT = N->getValueType(0); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. @@ -43108,80 +43650,53 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); - // We know N is a reduction add, which means one of its operands is a phi. - // To match SAD, we need the other operand to be a ABS. - if (Op0.getOpcode() != ISD::ABS) - std::swap(Op0, Op1); - if (Op0.getOpcode() != ISD::ABS) - return SDValue(); - - auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. 
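The reduction built around createPSADBW below relies on the PSADBW lane operation. A standalone scalar model of the 128-bit form (illustrative only, not from the patch): each 64-bit lane sums the absolute differences of its eight unsigned bytes, so sixteen byte elements collapse into two i64 partial sums, which is the "result has fewer elements" point made in the comment above.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>

std::array<uint64_t, 2> psadbw(const std::array<uint8_t, 16> &A,
                               const std::array<uint8_t, 16> &B) {
  std::array<uint64_t, 2> Out{};
  for (int Lane = 0; Lane != 2; ++Lane)
    for (int i = 0; i != 8; ++i)
      Out[Lane] += (uint64_t)std::abs((int)A[Lane * 8 + i] - (int)B[Lane * 8 + i]);
  return Out;
}

int main() {
  std::array<uint8_t, 16> A{}, B{};
  A[0] = 10; B[0] = 3;   // |10 - 3|  = 7   -> low lane
  A[8] = 1;  B[8] = 250; // |1 - 250| = 249 -> high lane
  auto Sad = psadbw(A, B);
  assert(Sad[0] == 7 && Sad[1] == 249);
  return 0;
}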
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero - // anyway. - MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - if (VT.getSizeInBits() >= ResVT.getSizeInBits()) - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - else - Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. - SDValue Zero = DAG.getConstant(0, DL, VT); - Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - return Sad; - }; + // We know N is a reduction add. To match SAD, we need one of the operands to + // be an ABS. + SDValue AbsOp = N->getOperand(0); + SDValue OtherOp = N->getOperand(1); + if (AbsOp.getOpcode() != ISD::ABS) + std::swap(AbsOp, OtherOp); + if (AbsOp.getOpcode() != ISD::ABS) + return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue SadOp0, SadOp1; - if (!detectZextAbsDiff(Op0, SadOp0, SadOp1)) - return SDValue(); - - Op0 = BuildPSADBW(SadOp0, SadOp1); - - // It's possible we have a sad on the other side too. - if (Op1.getOpcode() == ISD::ABS && - detectZextAbsDiff(Op1, SadOp0, SadOp1)) { - Op1 = BuildPSADBW(SadOp0, SadOp1); - } - - return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); -} - -/// Convert vector increment or decrement to sub/add with an all-ones constant: -/// add X, <1, 1...> --> sub X, <-1, -1...> -/// sub X, <1, 1...> --> add X, <-1, -1...> -/// The all-ones vector constant can be materialized using a pcmpeq instruction -/// that is commonly recognized as an idiom (has no register dependency), so -/// that's better/smaller than loading a splat 1 constant. -static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { - assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && - "Unexpected opcode for increment/decrement transform"); - - // Pseudo-legality check: getOnesVector() expects one of these types, so bail - // out and wait for legalization if we have an unsupported vector length. - EVT VT = N->getValueType(0); - if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) - return SDValue(); - - APInt SplatVal; - if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue()) - return SDValue(); - - SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); - unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; - return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); + if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) + return SDValue(); + + // SAD pattern detected. Now build a SAD instruction and an addition for + // reduction. Note that the number of elements of the result of SAD is less + // than the number of elements of its input. Therefore, we could only update + // part of elements in the reduction vector. + SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); + + // The output of PSADBW is a vector of i64. + // We need to turn the vector of i64 into a vector of i32. + // If the reduction vector is at least as wide as the psadbw result, just + // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of + // the PSADBW will be zero. 
If we promote/ narrow vectors, truncate the v2i64 + // result to v2i32 which will be removed by type legalization. If we/ widen + // narrow vectors then we bitcast to v4i32 and extract v2i32. + MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + + if (VT.getSizeInBits() > ResVT.getSizeInBits()) { + // Fill the upper elements with zero to match the add width. + assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); + unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); + SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); + Ops[0] = Sad; + Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); + } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { + Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, + DAG.getIntPtrConstant(0, DL)); + } + + // Preserve the reduction flag on the ADD. We may need to revisit for the + // other operand. + SDNodeFlags Flags; + Flags.setVectorReduction(true); + return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -43294,8 +43809,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, } // Attempt to turn this pattern into PMADDWD. -// (mul (add (zext (build_vector)), (zext (build_vector))), -// (add (zext (build_vector)), (zext (build_vector))) +// (mul (add (sext (build_vector)), (sext (build_vector))), +// (add (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -43415,6 +43930,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { @@ -43445,8 +43961,29 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, HADDBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; + // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into + // (sub Y, (sext (vXi1 X))). + // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in + // generic DAG combine without a legal type check, but adding this there + // caused regressions. + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Op0.getOpcode() == ISD::ZERO_EXTEND && + Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op0.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt); + } + + if (Op1.getOpcode() == ISD::ZERO_EXTEND && + Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && + TLI.isTypeLegal(Op1.getOperand(0).getValueType())) { + SDLoc DL(N); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0)); + return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt); + } + } return combineAddOrSubToADCOrSBB(N, DAG); } @@ -43457,13 +43994,15 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + // PSUBUS is supported, starting from SSE2, but truncation for v8i32 // is only worth it with SSSE3 (PSHUFB). 
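combineSubToSubus, continued below, forms PSUBUS (ISD::USUBSAT) when a sub's argument is a umax/umin. A standalone check of the two identities involved (not code from this commit), with usubsat(x, y) = x > y ? x - y : 0:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <initializer_list>

static uint16_t usubsat(uint16_t X, uint16_t Y) { return X > Y ? X - Y : 0; }

int main() {
  for (uint32_t X : {0u, 5u, 65535u})
    for (uint32_t Y : {0u, 9u, 65535u}) {
      uint16_t A = (uint16_t)X, B = (uint16_t)Y;
      assert((uint16_t)(A - std::min(A, B)) == usubsat(A, B)); // sub(x, umin(x, y))
      assert((uint16_t)(std::max(A, B) - B) == usubsat(A, B)); // sub(umax(x, y), y)
    }
  return 0;
}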
- if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && + EVT EltVT = VT.getVectorElementType(); + if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && - !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && - !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || - VT == MVT::v16i32 || VT == MVT::v8i64))) + !(Subtarget.useBWIRegs() && (VT == MVT::v16i32))) return SDValue(); SDValue SubusLHS, SubusRHS; @@ -43493,16 +44032,13 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, } else return SDValue(); - auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); - }; - // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. - if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, - { SubusLHS, SubusRHS }, USUBSATBuilder); + if (EltVT == MVT::i8 || EltVT == MVT::i16) + return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS); + + assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) && + "Unexpected VT!"); // Special preprocessing case can be only applied // if the value was zero extended from 16 bit, @@ -43531,15 +44067,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SDValue NewSubusLHS = DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); - SDValue Psubus = - SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType, - { NewSubusLHS, NewSubusRHS }, USUBSATBuilder); + SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType, + NewSubusLHS, NewSubusRHS); + // Zero extend the result, it may be used somewhere as 32 bit, // if not zext and following trunc will shrink. return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); } static SDValue combineSub(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -43576,9 +44113,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, HSUBBuilder); } - if (SDValue V = combineIncDecVector(N, DAG)) - return V; - // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; @@ -43712,14 +44246,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } - // If we're inserting all zeros into the upper half, change this to - // an insert into an all zeros vector. We will match this to a move - // with implicit upper bit zeroing during isel. - if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode())) - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - getZeroVector(VT, Subtarget, DAG, DL), Ops[0], - DAG.getIntPtrConstant(0, DL)); - return SDValue(); } @@ -43786,10 +44312,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // least as large as the original insertion. Just insert the original // subvector into a zero vector. 
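Stepping back to the combineAdd hunk above: with vXi1 legal, add(zext(X), Y) is rewritten as sub(Y, sext(X)). A standalone check of that lane identity (not from the patch): for a boolean x, zext(x) is in {0, 1} and sext(x) is in {0, -1}, so the two forms agree.

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (bool X : {false, true})
    for (int32_t Y : {0, 7, -100}) {
      int32_t ZExt = X ? 1 : 0;
      int32_t SExt = X ? -1 : 0;
      assert(Y + ZExt == Y - SExt); // add(zext(x), y) == sub(y, sext(x))
    }
  return 0;
}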
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && - SubVec.getConstantOperandAPInt(1) == 0 && + isNullConstant(SubVec.getOperand(1)) && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); - if (Ins.getConstantOperandAPInt(2) == 0 && + if (isNullConstant(Ins.getOperand(2)) && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, @@ -43825,31 +44351,42 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // Match concat_vector style patterns. SmallVector<SDValue, 2> SubVectorOps; - if (collectConcatOps(N, SubVectorOps)) + if (collectConcatOps(N, SubVectorOps)) { if (SDValue Fold = combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) return Fold; - // If we are inserting into both halves of the vector, the starting vector - // should be undef. If it isn't, make it so. Only do this if the early insert - // has no other uses. - // TODO: Should this be a generic DAG combine? - // TODO: Why doesn't SimplifyDemandedVectorElts catch this? - if ((IdxVal == OpVT.getVectorNumElements() / 2) && - Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 && - isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() && - Vec.hasOneUse()) { - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), - Vec.getOperand(1), Vec.getOperand(2)); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, - N->getOperand(2)); + // If we're inserting all zeros into the upper half, change this to + // a concat with zero. We will match this to a move + // with implicit upper bit zeroing during isel. + // We do this here because we don't want combineConcatVectorOps to + // create INSERT_SUBVECTOR from CONCAT_VECTORS. + if (SubVectorOps.size() == 2 && + ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, + getZeroVector(OpVT, Subtarget, DAG, dl), + SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); } // If this is a broadcast insert into an upper undef, use a larger broadcast. if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); + // If this is a broadcast load inserted into an upper undef, use a larger + // broadcast load. 
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && + SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec); + SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + return SDValue(); } @@ -43928,12 +44465,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); MVT VT = N->getSimpleValueType(0); - EVT WideVecVT = N->getOperand(0).getValueType(); - SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); + SDValue InVec = N->getOperand(0); + SDValue InVecBC = peekThroughBitcasts(InVec); + EVT InVecVT = InVec.getValueType(); + EVT InVecBCVT = InVecBC.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && - TLI.isTypeLegal(WideVecVT) && - WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { + TLI.isTypeLegal(InVecVT) && + InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) @@ -43941,12 +44481,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; - if (isConcatenatedNot(WideVec.getOperand(0)) || - isConcatenatedNot(WideVec.getOperand(1))) { + if (isConcatenatedNot(InVecBC.getOperand(0)) || + isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 - SDValue Concat = split256IntArith(WideVec, DAG); + SDValue Concat = split256IntArith(InVecBC, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, - DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); + DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } } @@ -43956,7 +44496,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; - SDValue InVec = N->getOperand(0); unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) @@ -43976,31 +44515,42 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec.getOpcode() == ISD::BITCAST && - InVec.getOperand(0).getValueType().isVector()) { - SDValue SrcOp = InVec.getOperand(0); - EVT SrcVT = SrcOp.getValueType(); - unsigned SrcNumElts = SrcVT.getVectorNumElements(); - unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); + if (InVec != InVecBC && InVecBCVT.isVector()) { + unsigned SrcNumElts = InVecBCVT.getVectorNumElements(); + unsigned DestNumElts = InVecVT.getVectorNumElements(); if ((DestNumElts % SrcNumElts) == 0) { unsigned DestSrcRatio = DestNumElts / SrcNumElts; if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); + InVecBCVT.getScalarType(), NewExtNumElts); if 
((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; SDLoc DL(N); SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, - SrcOp, NewIndex); + InVecBC, NewIndex); return DAG.getBitcast(VT, NewExtract); } } } } + // If we are extracting from an insert into a zero vector, replace with a + // smaller insert into zero if we don't access less than the original + // subvector. Don't do this for i1 vectors. + if (VT.getVectorElementType() != MVT::i1 && + InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 && + InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) && + ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) && + InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) { + SDLoc DL(N); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + getZeroVector(VT, Subtarget, DAG, DL), + InVec.getOperand(1), InVec.getOperand(2)); + } + // If we're extracting from a broadcast then we're better off just // broadcasting to the smaller type directly, assuming this is the only use. // As its a broadcast we don't care about the extraction index. @@ -44008,11 +44558,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); + if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { + auto *MemIntr = cast<MemIntrinsicSDNode>(InVec); + if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); - if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { + if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { @@ -44093,7 +44657,8 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -44103,23 +44668,43 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. + // Don't return RHS as it may contain UNDEFs. if (ISD::isBuildVectorAllZeros(RHS.getNode())) - return RHS; - - // Aggressively peek through ops to get at the demanded low bits. - APInt DemandedMask = APInt::getLowBitsSet(64, 32); - SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask); - SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask); - if (DemandedLHS || DemandedRHS) - return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), - DemandedLHS ? 
DemandedLHS : LHS, - DemandedRHS ? DemandedRHS : RHS); + return DAG.getConstant(0, SDLoc(N), N->getValueType(0)); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); + // If the input is an extend_invec and the SimplifyDemandedBits call didn't + // convert it to any_extend_invec, due to the LegalOperations check, do the + // conversion directly to a vector shuffle manually. This exposes combine + // opportunities missed by combineExtInVec not calling + // combineX86ShufflesRecursively on SSE4.1 targets. + // FIXME: This is basically a hack around several other issues related to + // ANY_EXTEND_VECTOR_INREG. + if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + LHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), + LHS.getOperand(0), { 0, -1, 1, -1 }); + LHS = DAG.getBitcast(MVT::v2i64, LHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() && + (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && + RHS.getOperand(0).getValueType() == MVT::v4i32) { + SDLoc dl(N); + RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), + RHS.getOperand(0), { 0, -1, 1, -1 }); + RHS = DAG.getBitcast(MVT::v2i64, RHS); + return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); + } + return SDValue(); } @@ -44134,7 +44719,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { auto *Ld = cast<LoadSDNode>(In); - if (!Ld->isVolatile()) { + if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, @@ -44150,17 +44735,6 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, } } - // Disabling for widening legalization for now. We can enable if we find a - // case that needs it. Otherwise it can be deleted when we switch to - // widening legalization. - if (ExperimentalVectorWideningLegalization) - return SDValue(); - - // Combine (ext_invec (ext_invec X)) -> (ext_invec X) - if (In.getOpcode() == N->getOpcode() && - TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType())) - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0)); - // Attempt to combine as a shuffle. 
// TODO: SSE41 support if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) { @@ -44173,6 +44747,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); + if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, + KnownZero, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -44196,8 +44784,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG); - case ISD::ADD: return combineAdd(N, DAG, Subtarget); - case ISD::SUB: return combineSub(N, DAG, Subtarget); + case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget); + case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); case X86ISD::ADD: case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); case X86ISD::SBB: return combineSBB(N, DAG); @@ -44214,12 +44802,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); @@ -44299,20 +44888,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, Subtarget); + case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: - case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); - case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); + case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); + case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: - case X86ISD::MSCATTER: + case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); + case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: - case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); + case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); + case X86ISD::KSHIFTL: + case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); } return SDValue(); @@ -44660,10 +45251,11 @@ 
X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'I': case 'J': case 'K': - case 'L': - case 'M': case 'N': case 'G': + case 'L': + case 'M': + return C_Immediate; case 'C': case 'e': case 'Z': @@ -45175,8 +45767,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); - // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. - // Vector types. + // TODO: Handle i128 in FR128RegClass after it is tested well. + // Vector types and fp128. + case MVT::f128: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: @@ -45469,7 +46062,7 @@ void X86TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be @@ -45514,3 +46107,16 @@ X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } + +unsigned +X86TargetLowering::getStackProbeSize(MachineFunction &MF) const { + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + const Function &Fn = MF.getFunction(); + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackProbeSize; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e0be03bc3f9d..6f7e90008de4 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -17,7 +17,6 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/Target/TargetOptions.h" namespace llvm { class X86Subtarget; @@ -144,6 +143,10 @@ namespace llvm { /// relative displacements. WrapperRIP, + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + /// Copies a 64-bit value from the low word of an XMM vector /// to an MMX vector. MOVDQ2Q, @@ -422,7 +425,8 @@ namespace llvm { // Tests Types Of a FP Values for scalar types. VFPCLASSS, - // Broadcast scalar to vector. + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. VBROADCAST, // Broadcast mask to vector. VBROADCASTM, @@ -611,6 +615,9 @@ namespace llvm { // extract_vector_elt, store. VEXTRACT_STORE, + // scalar broadcast from memory + VBROADCAST_LOAD, + // Store FP control world into i16 memory. FNSTCW16m, @@ -680,6 +687,9 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO); + /// If Op is a constant whose elements are all the same constant or + /// undefined, return true and return the constant value in \p SplatVal. + bool isConstantSplat(SDValue Op, APInt &SplatVal); } // end namespace X86 //===--------------------------------------------------------------------===// @@ -792,6 +802,17 @@ namespace llvm { /// and some i16 instructions are slow. 
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. Else return 0. + char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations, + bool ForCodeSize, unsigned Depth) const override; + + /// If isNegatibleForFree returns true, return the newly negated expression. + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -840,6 +861,13 @@ namespace llvm { bool hasAndNot(SDValue Y) const override; + bool hasBitTest(SDValue X, SDValue Y) const override; + + bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const override; + bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; @@ -863,11 +891,7 @@ namespace llvm { return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } - bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override { - if (DAG.getMachineFunction().getFunction().hasMinSize()) - return false; - return true; - } + bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override; bool shouldSplatInsEltVarIndex(EVT VT) const override; @@ -913,6 +937,10 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + SDValue SimplifyMultipleUseDemandedBitsForTargetNode( + SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, + SelectionDAG &DAG, unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; @@ -1090,11 +1118,12 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override; + bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override; bool convertSelectOfConstantsToMath(EVT VT) const override; - bool decomposeMulByConstant(EVT VT, SDValue C) const override; + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, bool IsSigned) const override; @@ -1136,8 +1165,8 @@ namespace llvm { return nullptr; // nothing to do, move along. } - unsigned getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const override; + Register getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. 
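The new getStackProbeSize() hook (implemented in X86ISelLowering.cpp above and declared in X86ISelLowering.h below) reads the per-function "stack-probe-size" string attribute and defaults to 4096 bytes when the attribute is absent. The following sketch is only an illustration of that producer/consumer contract, not part of the patch; the helper name probeSizeFor and the 8192 value are made up for the example.

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"

// Consumer side: mirrors the lookup done by X86TargetLowering::getStackProbeSize().
static unsigned probeSizeFor(const llvm::Function &Fn) {
  unsigned StackProbeSize = 4096;                 // default when the attribute is absent
  if (Fn.hasFnAttribute("stack-probe-size"))
    Fn.getFnAttribute("stack-probe-size")
        .getValueAsString()
        .getAsInteger(/*Radix=*/0, StackProbeSize); // return value ignored, as in the patch
  return StackProbeSize;
}

// Producer side: a frontend or pass tags the IR function so the backend sees it, e.g.
//   F->addFnAttr("stack-probe-size", "8192");    // value is illustrative only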
@@ -1189,12 +1218,18 @@ namespace llvm { CallingConv::ID CC, EVT VT) const override; + unsigned getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; + unsigned getStackProbeSize(MachineFunction &MF) const; + bool hasVectorBlend() const override { return true; } unsigned getMaxSupportedInterleaveFactor() const override { return 4; } @@ -1326,6 +1361,12 @@ namespace llvm { SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1372,6 +1413,9 @@ namespace llvm { LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override; + bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override; + bool needsCmpXchgNb(Type *MemType) const; void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, @@ -1462,6 +1506,9 @@ namespace llvm { /// Reassociate floating point divisions into multiply by reciprocal. unsigned combineRepeatedFPDivisors() const override; + + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, + SmallVectorImpl<SDNode *> &Created) const override; }; namespace X86 { @@ -1625,24 +1672,24 @@ namespace llvm { /// mask. This is the reverse process to canWidenShuffleElements, but can /// always succeed. template <typename T> - void scaleShuffleMask(int Scale, ArrayRef<T> Mask, + void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask, SmallVectorImpl<T> &ScaledMask) { assert(0 < Scale && "Unexpected scaling factor"); size_t NumElts = Mask.size(); ScaledMask.assign(NumElts * Scale, -1); - for (int i = 0; i != (int)NumElts; ++i) { + for (size_t i = 0; i != NumElts; ++i) { int M = Mask[i]; // Repeat sentinel values in every mask element. if (M < 0) { - for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = M; continue; } // Scale mask element and increment across each mask element. 
- for (int s = 0; s != Scale; ++s) + for (size_t s = 0; s != Scale; ++s) ScaledMask[(Scale * i) + s] = (Scale * M) + s; } } diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp index 04e8b2231fec..cc0f59ab329d 100644 --- a/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -84,7 +84,7 @@ bool X86IndirectBranchTrackingPass::addENDBR( return false; } -bool IsCallReturnTwice(llvm::MachineOperand &MOp) { +static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { if (!MOp.isGlobal()) return false; auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal()); diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp index 02ae73706a34..2b1e3f23efd7 100644 --- a/lib/Target/X86/X86InsertPrefetch.cpp +++ b/lib/Target/X86/X86InsertPrefetch.cpp @@ -79,8 +79,8 @@ ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples, // The prefetch instruction can't take memory operands involving vector // registers. bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); + Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); + Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); return (BaseReg == 0 || X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && @@ -108,7 +108,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, Prefetches &Prefetches) const { assert(Prefetches.empty() && "Expected caller passed empty PrefetchInfo vector."); - static const std::pair<const StringRef, unsigned> HintTypes[] = { + static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = { {"_nta_", X86::PREFETCHNTA}, {"_t0_", X86::PREFETCHT0}, {"_t1_", X86::PREFETCHT1}, @@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) { void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<MachineModuleInfo>(); + AU.addRequired<MachineModuleInfoWrapperPass>(); } bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 54eddeacaa17..9b5de59430a5 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName); PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName); ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), !cast<ComplexPattern>("sse_load_f32"), @@ -412,6 +413,14 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>; +def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; +} + // Alias instructions that allow VPTERNLOG to be used with a mask to create // a mix of all ones and all zeros elements. This is done this way to force // the same register to be used as input for all three sources. 
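The scaleShuffleMask change in X86ISelLowering.h above only switches the scale factor and loop counters to size_t; the widening transform itself is unchanged. As a quick illustration of what the template computes, here is a stand-alone sketch (std::vector stands in for SmallVectorImpl purely for the example):

#include <cassert>
#include <cstdio>
#include <vector>

// Widen a shuffle mask by Scale: every source index M becomes the Scale
// consecutive indices Scale*M .. Scale*M+Scale-1, while sentinel values
// (< 0, e.g. -1 for undef) are simply repeated.
static void scaleShuffleMask(size_t Scale, const std::vector<int> &Mask,
                             std::vector<int> &ScaledMask) {
  assert(Scale > 0 && "Unexpected scaling factor");
  size_t NumElts = Mask.size();
  ScaledMask.assign(NumElts * Scale, -1);
  for (size_t i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0) {
      for (size_t s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }
    for (size_t s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (int)(Scale * M + s);
  }
}

int main() {
  std::vector<int> Scaled;
  scaleShuffleMask(2, {0, -1, 3, 2}, Scaled);
  for (int M : Scaled)
    std::printf("%d ", M);      // prints: 0 1 -1 -1 6 7 4 5
  std::printf("\n");
  return 0;
}

For example, the v4 mask <0, undef, 3, 2> scaled by 2 becomes the v8 mask <0, 1, undef, undef, 6, 7, 4, 5>.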
@@ -436,6 +445,19 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", [(set VR256X:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [HasAVX512] in { +def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>; +} + // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -443,7 +465,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", - [(set FR64X:$dst, fpimm0)]>; + [(set FR64X:$dst, fp64imm0)]>; + def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", + [(set VR128X:$dst, fp128imm0)]>; } //===----------------------------------------------------------------------===// @@ -730,14 +754,14 @@ let isCommutable = 1 in def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -1100,75 +1124,104 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, X86VectorVTInfo MaskInfo, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, - SDPatternOperator UnmaskedOp = X86VBroadcast> { - let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in { - defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo, - (outs MaskInfo.RC:$dst), - (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))), - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>, - T8PD, EVEX, Sched<[SchedRR]>; - let mayLoad = 1 in - defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo, - (outs MaskInfo.RC:$dst), - (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", - (MaskInfo.VT - (bitconvert - (DestInfo.VT (UnmaskedOp - (SrcInfo.ScalarLdFrag addr:$src))))), - (MaskInfo.VT - (bitconvert - (DestInfo.VT (X86VBroadcast - (SrcInfo.ScalarLdFrag addr:$src)))))>, - T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, - Sched<[SchedRM]>; - } - - def : Pat<(MaskInfo.VT - (bitconvert - (DestInfo.VT (UnmaskedOp - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))))), - (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>; - def : Pat<(MaskInfo.VT (vselect 
MaskInfo.KRCWM:$mask, + bit IsConvertibleToThreeAddress, + SDPatternOperator UnmaskedOp = X86VBroadcast, + SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { + let hasSideEffects = 0 in + def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set MaskInfo.RC:$dst, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))], + DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>; + def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))))), - MaskInfo.RC:$src0)), - (!cast<Instruction>(Name#DestInfo.ZSuffix#mk) - MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; + let Constraints = "$src0 = $dst" in + def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>; + + let hasSideEffects = 0, mayLoad = 1 in + def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set MaskInfo.RC:$dst, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedBcastOp addr:$src)))))], + DestInfo.ExeDomain>, T8PD, EVEX, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + + def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))))), - MaskInfo.ImmAllZerosV)), - (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz) - MaskInfo.KRCWM:$mask, addr:$src)>; + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + + let Constraints = "$src0 = $dst", + isConvertibleToThreeAddress = IsConvertibleToThreeAddress in + def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; } // Helper class to force mask and broadcast result to same type. 
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name, SchedWrite SchedRR, SchedWrite SchedRM, X86VectorVTInfo DestInfo, - X86VectorVTInfo SrcInfo> : + X86VectorVTInfo SrcInfo, + bit IsConvertibleToThreeAddress> : avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM, - DestInfo, DestInfo, SrcInfo>; + DestInfo, DestInfo, SrcInfo, + IsConvertibleToThreeAddress>; multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info512, _.info128>, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, _.info128>, EVEX_V512; @@ -1176,7 +1229,7 @@ multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info256, _.info128>, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, _.info128>, EVEX_V256; @@ -1187,7 +1240,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info512, _.info128>, + WriteFShuffle256Ld, _.info512, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, _.info128>, EVEX_V512; @@ -1195,12 +1248,12 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info256, _.info128>, + WriteFShuffle256Ld, _.info256, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, _.info128>, EVEX_V256; defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, - WriteFShuffle256Ld, _.info128, _.info128>, + WriteFShuffle256Ld, _.info128, _.info128, 1>, avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128, _.info128>, EVEX_V128; @@ -1284,46 +1337,35 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, X86VBroadcast, GR64, HasAVX512>, VEX_W; -// Provide aliases for broadcast from the same register class that -// automatically does the extract. -multiclass avx512_int_broadcast_rm_lowering<string Name, - X86VectorVTInfo DestInfo, - X86VectorVTInfo SrcInfo, - X86VectorVTInfo ExtInfo> { - def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), - (!cast<Instruction>(Name#DestInfo.ZSuffix#"r") - (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>; -} - multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + bit IsConvertibleToThreeAddress> { let Predicates = [prd] in { defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, - WriteShuffle256Ld, _.info512, _.info128>, - avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>, + WriteShuffle256Ld, _.info512, _.info128, + IsConvertibleToThreeAddress>, EVEX_V512; - // Defined separately to avoid redefinition. 
- defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>; } let Predicates = [prd, HasVLX] in { defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, - WriteShuffle256Ld, _.info256, _.info128>, - avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>, + WriteShuffle256Ld, _.info256, _.info128, + IsConvertibleToThreeAddress>, EVEX_V256; defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle, - WriteShuffleXLd, _.info128, _.info128>, + WriteShuffleXLd, _.info128, _.info128, + IsConvertibleToThreeAddress>, EVEX_V128; } } defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", - avx512vl_i8_info, HasBWI>; + avx512vl_i8_info, HasBWI, 0>; defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", - avx512vl_i16_info, HasBWI>; + avx512vl_i16_info, HasBWI, 0>; defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", - avx512vl_i32_info, HasAVX512>; + avx512vl_i32_info, HasAVX512, 1>; defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", - avx512vl_i64_info, HasAVX512>, VEX_W1X; + avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { @@ -1354,6 +1396,10 @@ let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZm addr:$src)>; } let Predicates = [HasVLX] in { @@ -1362,6 +1408,12 @@ let Predicates = [HasVLX] in { (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ128m addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1382,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ256m addr:$src)>; } let Predicates = [HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1394,6 +1452,10 @@ let Predicates = [HasBWI] in { def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1629,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI] in defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, WriteShuffle256Ld, _Dst.info512, - _Src.info512, _Src.info128, null_frag>, + _Src.info512, _Src.info128, 0, null_frag, null_frag>, EVEX_V512; let Predicates = [HasDQI, HasVLX] in defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, WriteShuffle256Ld, _Dst.info256, - _Src.info256, _Src.info128, null_frag>, + _Src.info256, _Src.info128, 0, null_frag, null_frag>, EVEX_V256; } @@ -1645,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle, WriteShuffleXLd, _Dst.info128, - _Src.info128, _Src.info128, null_frag>, + _Src.info128, _Src.info128, 0, null_frag, null_frag>, EVEX_V128; } @@ -1654,23 +1716,6 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", avx512vl_f32_info, avx512vl_f64_info>; -let Predicates = [HasVLX] in { -def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; -def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; -} - -def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>; -def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), - (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; - -def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>; -def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), - (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; - //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- @@ -1730,7 +1775,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, - IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1807,7 +1852,7 @@ multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _, def : Pat<(_.VT (vselect _.KRCWM:$mask, (X86VPermt2 _.RC:$src2, (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), - (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.BroadcastLdFrag addr:$src3)), (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; @@ -1846,7 +1891,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, 
Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1947,7 +1992,7 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr, } multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let mayLoad = 1, hasSideEffects = 0 in { + let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, @@ -2031,9 +2076,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; + timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2041,9 +2086,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc), + timm:$cc), (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, @@ -2052,9 +2097,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc), + timm:$cc), (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, + timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { @@ -2065,7 +2110,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, - imm:$cc))]>, + timm:$cc))]>, EVEX_4V, VEX_LIG, Sched<[sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), @@ -2074,7 +2119,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), - imm:$cc))]>, + timm:$cc))]>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2100,94 +2145,82 @@ let Predicates = [HasAVX512] in { SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } -multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable, hasSideEffects = 0 in def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>, - EVEX_4V, Sched<[sched]>; + []>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2))))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCommutable = IsCommutable in + []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, - EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), - (_.VT (_.LdFrag addr:$src2)))))]>, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> { + avx512_icmp_packed<opc, OpcodeStr, sched, _, IsCommutable> { + let mayLoad = 1, hasSideEffects = 0 in { def rmb : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - [(set _.KRC:$dst, (and _.KRCWM:$mask, - (OpNode_su (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))]>, - EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, + defm Z : avx512_icmp_packed<opc, OpcodeStr, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, 
Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128, IsCommutable>, EVEX_V128; } } @@ -2195,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, // This fragment treats X86cmpm as commutable to help match loads in both // operands for PCMPEQ. def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; -def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), - (X86setcc_commute node:$src1, node:$src2, SETEQ)>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; -def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm_c node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpgtm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -2322,8 +2344,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, "$dst, $src1, ${src2}", 
_.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8<opc, MRMSrcMem, @@ -2335,23 +2356,21 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, [(set _.KRC:$dst, (and _.KRCWM:$mask, (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast<Instruction>(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag_su:$cc (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast<Instruction>(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - (CommFrag.OperandTransform $cc))>; + (CommFrag_su.OperandTransform $cc))>; } multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag, @@ -2496,14 +2515,19 @@ def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc), return N->hasOneUse(); }]>; +def X86cmpm_imm_commute : SDNodeXForm<timm, [{ + uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f); + return getI8Imm(Imm, SDLoc(N)); +}]>; + multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), - (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), + (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), 1>, Sched<[sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2511,9 +2535,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), - imm:$cc), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), - imm:$cc)>, + timm:$cc)>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, @@ -2523,38 +2547,37 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, "$cc, ${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr#", $cc", (X86cmpm (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc), + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc)>, + (_.VT (_.BroadcastLdFrag addr:$src2)), + timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; // Patterns for selecting with loads in other operand. 
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc), + timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (_.VT _.RC:$src1), CommutableCMPCC:$cc), + def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), + (_.VT _.RC:$src1), timm:$cc), (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), - CommutableCMPCC:$cc)), + timm:$cc)), (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, _.RC:$src1, addr:$src2, - imm:$cc)>; + (X86cmpm_imm_commute timm:$cc))>; } multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -2564,9 +2587,9 @@ multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1", "$src1, $src2, {sae}, $cc", - (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc), + (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, + timm:$cc)>, EVEX_B, Sched<[sched]>; } @@ -2590,12 +2613,12 @@ defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>, // Patterns to select fp compares with load as first operand. let Predicates = [HasAVX512] in { def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, - CommutableCMPCC:$cc)), - (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, - CommutableCMPCC:$cc)), - (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>; + timm:$cc)), + (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; } // ---------------------------------------------------------------- @@ -2621,7 +2644,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), @@ -2629,7 +2652,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.IntScalarMemOp:$src1, i32u8imm:$src2), @@ -2637,7 +2660,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, (X86Vfpclasss _.ScalarIntMemCPat:$src1, - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), @@ -2645,7 +2668,7 @@ multiclass 
avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su _.ScalarIntMemCPat:$src1, - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2661,7 +2684,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), @@ -2669,7 +2692,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT _.RC:$src1), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), @@ -2677,7 +2700,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2)))]>, + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), @@ -2685,7 +2708,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.LdFrag addr:$src1)), - (i32 imm:$src2))))]>, + (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), @@ -2693,9 +2716,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(X86Vfpclass - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2)))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), @@ -2703,9 +2725,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))))]>, + (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))))]>, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2836,13 +2857,21 @@ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), (KMOVWrk VK16:$src)>; +def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>; def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), (COPY_TO_REGCLASS VK16:$src, GR32)>; +def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>; def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), 
sub_32bit)>, Requires<[HasDQI]>; def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), (COPY_TO_REGCLASS VK8:$src, GR32)>; +def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), + (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>; def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (COPY_TO_REGCLASS GR32:$src, VK32)>; @@ -3075,7 +3104,7 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), !strconcat(OpcodeStr, "\t{$imm, $src, $dst|$dst, $src, $imm}"), - [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>, + [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>, Sched<[sched]>; } @@ -3098,30 +3127,6 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, - string InstStr, - X86VectorVTInfo Narrow, - X86VectorVTInfo Wide> { - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2))), - (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr#"Zrr") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), - (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr#"Zrrk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; -} - -// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, string InstStr, X86VectorVTInfo Narrow, @@ -3129,7 +3134,7 @@ multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), (Narrow.VT Narrow.RC:$src2), cond)), (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr##Zrri) + (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), (Frag.OperandTransform $cc)), Narrow.KRC)>; @@ -3138,53 +3143,111 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, (Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), (Narrow.VT Narrow.RC:$src2), cond)))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - (Frag.OperandTransform $cc)), Narrow.KRC)>; + (Frag_su.OperandTransform $cc)), Narrow.KRC)>; +} + +multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su, + PatFrag CommFrag, PatFrag CommFrag_su, + string InstStr, + X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { +// Broadcast load. 
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), cond)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (Frag_su:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.BroadcastLdFrag addr:$src2), + cond)))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>; + +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmib") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT + (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2), + (Narrow.VT Narrow.RC:$src1), + cond)))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>; } // Same as above, but for fp types which don't use PatFrags. -multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su, - string InstStr, +multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { -def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc)), (COPY_TO_REGCLASS - (!cast<Instruction>(InstStr##Zrri) + (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; + timm:$cc), Narrow.KRC)>; def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (OpNode_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), imm:$cc))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik") (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), - imm:$cc), Narrow.KRC)>; -} + timm:$cc), Narrow.KRC)>; -let Predicates = [HasAVX512, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. - let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>; +// Broadcast load. 
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>; +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, timm:$cc), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>; +// Commuted with broadcast load. +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrmbi") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>; - } +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; +} +let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>; @@ -3197,29 +3260,25 @@ let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>; - defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>; -} + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>; -let Predicates = [HasBWI, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. 
- let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>; - } + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>; +} +let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>; @@ -4186,16 +4245,32 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))), + (COPY_TO_REGCLASS + (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR32X)>; +def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>; + def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; -def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)), +def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; 
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))), + (COPY_TO_REGCLASS + (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)), + VK1WM:$mask, addr:$src)), + FR64X)>; +def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), @@ -4537,8 +4612,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4664,8 +4738,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, "${src2}"##_Brdct.BroadcastStr##", $src1", "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Brdct.VT (X86VBroadcast - (_Brdct.ScalarLdFrag addr:$src2))))))>, + (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4737,8 +4810,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, + (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4874,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - - def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), - (EXTRACT_SUBREG - (VPMULLQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), - sub_xmm)>; -} - -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
-let Predicates = [HasDQI, NoVLX] in { - def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), (EXTRACT_SUBREG - (VPMULLQZrr + (VPMULLQZrmb (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + addr:$src2), sub_ymm)>; def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), @@ -4898,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (VPMULLQZrmb + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } -multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> { +multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> { def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast<Instruction>(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; + def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast<Instruction>(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + addr:$src2), + sub_ymm)>; def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast<Instruction>(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast<Instruction>(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } let Predicates = [HasAVX512, NoVLX] in { - defm : avx512_min_max_lowering<VPMAXUQZrr, umax>; - defm : avx512_min_max_lowering<VPMINUQZrr, umin>; - defm : avx512_min_max_lowering<VPMAXSQZrr, smax>; - defm : avx512_min_max_lowering<VPMINSQZrr, smin>; + defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; + defm : avx512_min_max_lowering<"VPMINUQZ", umin>; + defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; + defm : avx512_min_max_lowering<"VPMINSQZ", smin>; } //===----------------------------------------------------------------------===// @@ -4977,32 +5056,6 @@ let Predicates = [HasVLX] in { def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)), (VPANDNQZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(and VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>; - - def : Pat<(and VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(or VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(xor VR128X:$src1, - 
(bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>; - def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)), (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>; def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)), @@ -5042,32 +5095,6 @@ let Predicates = [HasVLX] in { (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)), (VPANDNQZ256rm VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>; - - def : Pat<(and VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(or VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(xor VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZ256rmb VR256X:$src1, addr:$src2)>; - def : Pat<(X86andnp VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>; } let Predicates = [HasAVX512] in { @@ -5110,32 +5137,6 @@ let Predicates = [HasAVX512] in { (VPANDNQZrm VR512:$src1, addr:$src2)>; def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)), (VPANDNQZrm VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPXORDZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))), - (VPANDNDZrmb VR512:$src1, addr:$src2)>; - - def : Pat<(and VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(or VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(xor VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPXORQZrmb VR512:$src1, addr:$src2)>; - def : Pat<(X86andnp VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))), - (VPANDNQZrmb VR512:$src1, addr:$src2)>; } // Patterns to catch vselect with different type than logic op. @@ -5174,25 +5175,17 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode, X86VectorVTInfo _, X86VectorVTInfo IntInfo> { // Register-broadcast logical operations. 
- def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.ImmAllZerosV)), (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5329,7 +5322,8 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, - X86FoldableSchedWrite sched, bit IsCommutable> { + X86FoldableSchedWrite sched, bit IsCommutable, + string EVEX2VexOvrd> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -5349,7 +5343,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, + EVEX2VEXOverride<EVEX2VexOvrd#"rr"> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5357,7 +5352,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride<EVEX2VexOvrd#"rm">; } defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -5387,10 +5383,12 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, - VecNode, SaeNode, sched.PS.Scl, IsCommutable>, + VecNode, SaeNode, sched.PS.Scl, IsCommutable, + NAME#"SS">, XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, - VecNode, SaeNode, sched.PD.Scl, IsCommutable>, + VecNode, SaeNode, sched.PD.Scl, IsCommutable, + NAME#"SD">, XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; } defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds, @@ -5410,13 +5408,14 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, - X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched, + string EVEX2VEXOvrd> { let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]> { + Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> 
{ let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5424,24 +5423,27 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMINCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XS, EVEX_4V, - VEX_LIG, EVEX_CD8<32, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSS">, XS, + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, - VEX_LIG, EVEX_CD8<64, CD8VT1>; + SchedWriteFCmp.Scl, "VMAXCSD">, XD, + VEX_W, EVEX_4V, VEX_LIG, + EVEX_CD8<64, CD8VT1>; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, @@ -5464,8 +5466,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5595,8 +5596,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5751,13 +5751,13 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>, + (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>, Sched<[sched]>; defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)), - (i8 imm:$src2)))>, + (i8 timm:$src2)))>, Sched<[sched.Folded]>; } } @@ -5769,7 +5769,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", - (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>, + (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5911,17 +5911,17 @@ let Predicates = [HasAVX512, NoVLX] in { (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), 
VR128X:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))), + def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; + timm:$src2)), sub_xmm)>; } //===-------------------------------------------------------------------===// @@ -5953,8 +5953,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))>, + (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6062,27 +6061,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
@@ -6113,27 +6112,27 @@ let Predicates = [HasAVX512, NoVLX] in { (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; - def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORQZri (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; - def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - imm:$src2)), sub_xmm)>; - def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + timm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORDZri (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - imm:$src2)), sub_ymm)>; + timm:$src2)), sub_ymm)>; } //===-------------------------------------------------------------------===// @@ -6228,8 +6227,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6419,7 +6417,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6493,7 +6491,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6571,7 +6569,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6964,7 +6962,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -7504,14 +7502,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, OpcodeStr, 
"${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode (_Src.VT - (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + (_Src.BroadcastLdFrag addr:$src)) )), (vselect MaskRC:$mask, (_.VT (OpNode (_Src.VT - (X86VBroadcast - (_Src.ScalarLdFrag addr:$src))))), + (_Src.BroadcastLdFrag addr:$src)))), _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; @@ -7646,14 +7643,14 @@ let Predicates = [HasAVX512] in { v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZrmb addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), (v8f32 VR256X:$src0)), (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; } @@ -7677,14 +7674,14 @@ let Predicates = [HasVLX] in { v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZ256rmb addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), VR128X:$src0), (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; @@ -7708,12 +7705,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))), + def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8194,12 +8191,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8223,12 +8220,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : 
Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8252,12 +8249,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8281,12 +8278,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8419,12 +8416,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8448,12 +8445,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, 
VK2WM:$mask), (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8576,21 +8573,21 @@ let ExeDomain = GenericDomain in { (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _dest.RC:$dst, - (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>, + (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, Sched<[RR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.RC:$src0, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_K; def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}", [(set _dest.RC:$dst, - (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2), + (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2), _dest.ImmAllZerosV, _src.KRCWM:$mask))]>, Sched<[RR]>, EVEX_KZ; let hasSideEffects = 0, mayStore = 1 in { @@ -8631,17 +8628,17 @@ let Predicates = [HasAVX512] in { } def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>; - def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>; + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>; + def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. 
@@ -8765,7 +8762,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8859,7 +8856,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8940,7 +8937,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (fsqrt (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9049,14 +9046,14 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3)))>, + (i32 timm:$src3)))>, Sched<[sched]>; defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", (_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3)))>, EVEX_B, + (i32 timm:$src3)))>, EVEX_B, Sched<[sched]>; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -9064,7 +9061,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, - _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>, + _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -9082,15 +9079,15 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, } let Predicates = [HasAVX512] in { - def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2), + def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2), (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), - _.FRC:$src1, imm:$src2))>; + _.FRC:$src1, timm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { - def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2), + def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), - addr:$src1, imm:$src2))>; + addr:$src1, timm:$src2))>; } } @@ -10109,19 +10106,19 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, Sched<[sched]>; + (i32 timm:$src2))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2))>, + (i32 timm:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, 
"${src1}"##_.BroadcastStr##", $src2", - (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))>, EVEX_B, + (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10136,7 +10133,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))>, + (i32 timm:$src2))>, EVEX_B, Sched<[sched]>; } @@ -10169,22 +10166,22 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10200,7 +10197,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT SrcInfo.RC:$src2), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), @@ -10208,7 +10205,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT (bitconvert (SrcInfo.LdFrag addr:$src2))), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10226,8 +10223,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10241,15 +10238,14 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, Sched<[sched]>; defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3))>, + (_.VT _.ScalarIntMemCPat:$src2), + (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10265,7 +10261,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, "$src1, $src2, {sae}, 
$src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10279,7 +10275,7 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3))>, + (i32 timm:$src3))>, EVEX_B, Sched<[sched]>; } @@ -10401,7 +10397,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), @@ -10410,7 +10406,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, (CastInfo.LdFrag addr:$src2), - (i8 imm:$src3)))))>, + (i8 timm:$src3)))))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -10421,8 +10417,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - (i8 imm:$src3)))))>, EVEX_B, + (_.BroadcastLdFrag addr:$src2), + (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10491,14 +10487,14 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>, + (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)), - (i8 imm:$src3)))>, + (i8 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>, EVEX2VEXOverride<"VPALIGNRrmi">; @@ -10507,8 +10503,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (X86VAlign _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src2)), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10541,13 +10537,13 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", // Fragments to help convert valignq into masked valignd. Or valignq/valignd // into vpalignr. 
-def ValignqImm32XForm : SDNodeXForm<imm, [{ +def ValignqImm32XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 2, SDLoc(N)); }]>; -def ValignqImm8XForm : SDNodeXForm<imm, [{ +def ValignqImm8XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 8, SDLoc(N)); }]>; -def ValigndImm8XForm : SDNodeXForm<imm, [{ +def ValigndImm8XForm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() * 4, SDLoc(N)); }]>; @@ -10557,40 +10553,40 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode, def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, From.RC:$src2, - imm:$src3))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, From.RC:$src2, - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (From.LdFrag addr:$src2), - imm:$src3))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, @@ -10599,35 +10595,32 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, SDNodeXForm ImmXForm> : avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> { def : Pat<(From.VT (OpNode From.RC:$src1, - (bitconvert (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3)), + (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3)), (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; def : Pat<(To.VT (vselect To.KRCWM:$mask, (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), - imm:$src3))), + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, - (ImmXForm imm:$src3))>; + (ImmXForm timm:$src3))>; } let Predicates = [HasAVX512] in { @@ -10666,13 +10659,13 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase, + (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, 
"$src1", "$src1", - (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10685,8 +10678,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, - (_.VT (OpNode (X86VBroadcast - (_.ScalarLdFrag addr:$src1))))>, + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10770,7 +10762,7 @@ let Predicates = [HasAVX512, NoVLX] in { multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd, NoVLX] in { - def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10778,7 +10770,7 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, _.info256.SubRegIdx)), _.info256.SubRegIdx)>; - def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), @@ -10829,17 +10821,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX, + (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src)))))>, + (_.VT (_.BroadcastLdFrag addr:$src))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, Sched<[sched.Folded]>; } @@ -10853,7 +10844,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX512, HasVLX] in { defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM, + defm Z128 : avx512_movddup_128<opc, OpcodeStr, sched.XMM, VTInfo.info128>, EVEX_V128; } } @@ -10867,11 +10858,9 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode, defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>; let Predicates = [HasVLX] in { -def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPZ128rm addr:$src)>; @@ -10884,17 +10873,17 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 
(COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -11070,14 +11059,14 @@ multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, def rr : AVX512<opc, MRMr, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; def rm : AVX512<opc, MRMm, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst,(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11104,6 +11093,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _dst, X86VectorVTInfo _src> { + let isCommutable = 1 in def rr : AVX512BI<opc, MRMSrcReg, (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -11140,7 +11130,7 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", // Transforms to swizzle an immediate to enable better matching when // memory operand isn't in the right place. -def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG321_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 1/4 and 3/6. @@ -11151,7 +11141,7 @@ def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x08; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG213_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 2/4 and 3/5. @@ -11162,7 +11152,7 @@ def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x20) NewImm |= 0x08; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG132_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. uint8_t Imm = N->getZExtValue(); // Swap bits 1/2 and 5/6. 
@@ -11173,7 +11163,7 @@ def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x20; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG231_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by moving operand 1 to the end. uint8_t Imm = N->getZExtValue(); // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5 @@ -11186,7 +11176,7 @@ def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ if (Imm & 0x40) NewImm |= 0x20; return getI8Imm(NewImm, SDLoc(N)); }]>; -def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{ +def VPTERNLOG312_imm8 : SDNodeXForm<timm, [{ // Convert a VPTERNLOG immediate by moving operand 2 to the beginning. uint8_t Imm = N->getZExtValue(); // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3 @@ -11210,7 +11200,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT _.RC:$src3), - (i8 imm:$src4)), 1, 1>, + (i8 timm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), @@ -11218,7 +11208,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, + (i8 timm:$src4)), 1, 0>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -11227,146 +11217,145 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr##", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, EVEX_B, + (_.VT (_.BroadcastLdFrag addr:$src3)), + (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)), + (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>; // Additional patterns for matching loads in other positions. 
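The five SDNodeXForms above are nothing more than permutations of the 8-entry VPTERNLOG truth table. Below is a minimal standalone sketch of that derivation, assuming the usual encoding where operand 0 selects bit 2 of the table index, operand 1 selects bit 1, and operand 2 selects bit 0; the helper and its names are illustrative only, not part of the patch.

#include <cassert>
#include <cstdint>

// Bit (4*A + 2*B + C) of the immediate holds the result for inputs A, B, C.
// Perm[k] gives the index of the old operand that new operand k reads.
static uint8_t swizzleTernlogImm(uint8_t Imm, const int Perm[3]) {
  uint8_t NewImm = 0;
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B)
      for (int C = 0; C <= 1; ++C) {
        int Val[3] = {A, B, C};
        int OldIdx = 4 * A + 2 * B + C;
        int NewIdx = 4 * Val[Perm[0]] + 2 * Val[Perm[1]] + Val[Perm[2]];
        if (Imm & (1 << OldIdx))
          NewImm |= 1 << NewIdx;
      }
  return NewImm;
}

int main() {
  const int Perm321[3] = {2, 1, 0}; // new operand order: src3, src2, src1
  for (unsigned Imm = 0; Imm < 256; ++Imm) {
    // Same bit shuffle that VPTERNLOG321_imm8 hard-codes:
    // keep bits 0/2/5/7, swap 1<->4 and 3<->6.
    uint8_t Expected = Imm & 0xa5;
    if (Imm & 0x02) Expected |= 0x10;
    if (Imm & 0x10) Expected |= 0x02;
    if (Imm & 0x08) Expected |= 0x40;
    if (Imm & 0x40) Expected |= 0x08;
    assert(swizzleTernlogImm(Imm, Perm321) == Expected);
  }
  return 0;
}

The other four transforms fall out of the corresponding permutations, e.g. {1, 0, 2} for the 213 form reproduces the 2/4 and 3/5 swaps.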
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + _.RC:$src2, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with loads in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked loads with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)), + (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; // Additional patterns for matching broadcasts in other positions. 
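In the broadcast patterns that follow, _.BroadcastLdFrag stands for a load of a single element that is replicated across the whole destination width, so only one scalar is actually read from memory. A rough scalar model of that behaviour, using a hypothetical helper name and std::array as a stand-in for a 512-bit vector of qwords:

#include <array>
#include <cstdint>

// One 64-bit load, splatted to every lane (cf. the ...rmbi forms below).
static std::array<uint64_t, 8> broadcast64(const uint64_t *p) {
  std::array<uint64_t, 8> V;
  V.fill(*p);
  return V;
}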
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4))), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4))), (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, - addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching zero masking with broadcasts in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG321_imm8 imm:$src4))>; + (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, - (VPTERNLOG132_imm8 imm:$src4))>; + (VPTERNLOG132_imm8 timm:$src4))>; // Additional patterns for matching masked broadcasts with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, (i8 imm:$src4)), + (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - (i8 imm:$src4)), _.RC:$src1)), + (_.BroadcastLdFrag addr:$src3), + (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, (i8 imm:$src4)), + (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + (OpNode (_.BroadcastLdFrag addr:$src3), + _.RC:$src1, 
_.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, - _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; } multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched, @@ -11387,6 +11376,113 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU, defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, avx512vl_i64_info>, VEX_W; +// Patterns to use VPTERNLOG for vXi16/vXi8 vectors. +let Predicates = [HasVLX] in { + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv16i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, + (loadv8i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2, + VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv32i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, + (loadv16i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2, + VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3), + VR256X:$src2, (i8 
timm:$src4))), + (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv64i8 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2, + VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, + (i8 timm:$src4))), + (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (loadv32i16 addr:$src3), (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2, + VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; +} + // Patterns to implement vnot using vpternlog instead of creating all ones // using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen // so that the result is only dependent on src0. 
But we use the same source @@ -11498,14 +11594,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), - (i32 imm:$src4))>, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), @@ -11513,8 +11609,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, "$src2, ${src3}"##_.BroadcastStr##", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" } @@ -11531,7 +11627,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (X86VFixupimmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched]>; } } @@ -11547,7 +11643,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>; defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -11555,7 +11651,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (X86VFixupimmSAEs (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 imm:$src4))>, + (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), @@ -11564,13 +11660,13 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, (_.VT _.RC:$src2), (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), - (i32 imm:$src4))>, + (i32 timm:$src4))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched, - AVX512VLVectorVTInfo _Vec, + AVX512VLVectorVTInfo _Vec, AVX512VLVectorVTInfo _Tbl> { let Predicates = [HasAVX512] in defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, @@ -11804,7 +11900,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -11880,12 +11976,14 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256, let Constraints = "$src1 = $dst" in multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + X86FoldableSchedWrite sched, X86VectorVTInfo VTI, + bit IsCommutable> { defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs 
VTI.RC:$dst), (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, - VTI.RC:$src2, VTI.RC:$src3))>, + VTI.RC:$src2, VTI.RC:$src3)), + IsCommutable, IsCommutable>, EVEX_4V, T8PD, Sched<[sched]>; defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, @@ -11899,27 +11997,58 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, OpStr, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast - (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, - X86SchedWriteWidths sched> { + X86SchedWriteWidths sched, bit IsCommutable> { let Predicates = [HasVNNI] in - defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512; + defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info, + IsCommutable>, EVEX_V512; let Predicates = [HasVNNI, HasVLX] in { - defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256; - defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128; + defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info, + IsCommutable>, EVEX_V256; + defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info, + IsCommutable>, EVEX_V128; } } // FIXME: Is there a better scheduler class for VPDP? -defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>; -defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>; -defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>; -defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>; +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>; + +def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs), + (X86vpmaddwd node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; + +// Patterns to match VPDPWSSD from existing instructions/intrinsics. 
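The patterns that follow fold a separate vector add of a vpmaddwd result into a single VPDPWSSD, which is sound because each 32-bit lane of vpdpwssd is exactly that sum; the X86vpmaddwd_su one-use guard above keeps a standalone vpmaddwd whenever its value still has other users. A per-lane scalar sketch, with helper names of my own choosing:

#include <cstdint>

// pmaddwd: two adjacent signed 16x16->32 products, summed per 32-bit lane.
// The only overflow case (all four inputs 0x8000) wraps, as the hardware does.
static int32_t pmaddwd_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1) {
  return static_cast<int32_t>(int64_t(a0) * b0 + int64_t(a1) * b1);
}

// vpdpwssd: accumulate the pmaddwd lane sum with wrap-around; the 's'-suffixed
// vpdpwssds variant would saturate instead.
static int32_t vpdpwssd_lane(int32_t acc, int16_t a0, int16_t a1,
                             int16_t b0, int16_t b1) {
  return static_cast<int32_t>(int64_t(acc) + pmaddwd_lane(a0, a1, b0, b1));
}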
+let Predicates = [HasVNNI] in { + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, VR512:$src3))), + (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>; + def : Pat<(v16i32 (add VR512:$src1, + (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))), + (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>; +} +let Predicates = [HasVNNI,HasVLX] in { + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))), + (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>; + def : Pat<(v8i32 (add VR256X:$src1, + (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))), + (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))), + (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>; + def : Pat<(v4i32 (add VR128X:$src1, + (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))), + (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>; +} //===----------------------------------------------------------------------===// // Bit Algorithms @@ -12004,8 +12133,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", (OpNode (VTI.VT VTI.RC:$src1), - (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), - (i8 imm:$src3))>, EVEX_B, + (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), + (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12116,7 +12245,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect - _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; } @@ -12217,12 +12346,12 @@ let Predicates = [HasBF16, HasVLX] in { (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcast (loadf32 addr:$src))))), + (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), (v8i16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), v8i16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } @@ -12249,7 +12378,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index e52635f8d48b..1e399a894490 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1271,22 +1271,22 @@ let isCompare = 1 in { // ANDN Instruction // multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, - PatFrag ld_frag> { + PatFrag ld_frag, X86FoldableSchedWrite sched> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), 
(ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - Sched<[WriteALU]>; + Sched<[sched]>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>; } // Complexity is reduced to give and with immediate a chance to match first. let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W; } let Predicates = [HasBMI], AddedComplexity = -6 in { diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 50aed98112c3..aa45e9b191c1 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -131,11 +131,11 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { /// reference. static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg) { - // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg. - MI->getOperand(Operand).setReg(Reg); + // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg. + MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false); MI->getOperand(Operand + 1).setImm(1); MI->getOperand(Operand + 2).setReg(0); - MI->getOperand(Operand + 3).setImm(0); + MI->getOperand(Operand + 3).ChangeToImmediate(0); MI->getOperand(Operand + 4).setReg(0); } diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index 099f6aa8d8bb..330b8c7a8a43 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -20,19 +20,19 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, - (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rr : I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, - (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>, + (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rr :RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, - (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB; + (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB; } let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", @@ -41,29 +41,46 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", : I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond), "cmov${cond}{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize16; + timm:$cond, EFLAGS))]>, TB, OpSize16; def CMOV32rm : I<0x40, MRMSrcMemCC, 
(outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond), "cmov${cond}{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), - imm:$cond, EFLAGS))]>, TB, OpSize32; + timm:$cond, EFLAGS))]>, TB, OpSize32; def CMOV64rm :RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond), "cmov${cond}{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), - imm:$cond, EFLAGS))]>, TB; + timm:$cond, EFLAGS))]>, TB; } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" } // isCodeGenOnly = 1, ForceDisassemble = 1 +def inv_cond_XFORM : SDNodeXForm<imm, [{ + X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue()); + return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), + SDLoc(N), MVT::i8); +}]>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. +let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS), + (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS), + (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS), + (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>; +} + // SetCC instructions. let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in { def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond), "set${cond}\t$dst", - [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>, + [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>, TB, Sched<[WriteSETCC]>; def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond), "set${cond}\t$dst", - [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>, + [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>, TB, Sched<[WriteSETCCStore]>; } // Uses = [EFLAGS] diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index efaccdc9ee96..78d8dd3c0d03 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -542,7 +542,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { def CMOV#NAME : I<0, Pseudo, (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), "#CMOV_"#NAME#" PSEUDO!", - [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond, EFLAGS)))]>; } @@ -593,66 +593,66 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>; } // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] -def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; +def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; let Predicates = [NoVLX] in { - def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)), - (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>; - - def : Pat<(v32i8 (X86cmov VR256:$t, 
VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)), - (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + + def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; } let Predicates = [HasVLX] in { - def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)), - (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>; - - def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; - def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)), - (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>; + def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X 
VR128X:$t, VR128X:$f, timm:$cond)>; + + def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; } -def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; -def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)), - (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>; +def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -1126,12 +1126,12 @@ def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))), // binary size compared to a regular MOV, but it introduces an unnecessary // load, so is not suitable for regular or optsize functions. 
let Predicates = [OptForMinSize] in { -def : Pat<(nonvolatile_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; -def : Pat<(nonvolatile_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; -def : Pat<(nonvolatile_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; +def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; +def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; } // In kernel code model, we can get the address of a label @@ -1276,23 +1276,6 @@ def : Pat<(X86cmp GR32:$src1, 0), def : Pat<(X86cmp GR64:$src1, 0), (TEST64rr GR64:$src1, GR64:$src1)>; -def inv_cond_XFORM : SDNodeXForm<imm, [{ - X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue()); - return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC), - SDLoc(N), MVT::i8); -}]>; - -// Conditional moves with folded loads with operands swapped and conditions -// inverted. -let Predicates = [HasCMov] in { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS), - (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS), - (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS), - (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>; -} - // zextload bool -> zextload byte // i1 stored in one byte in zero-extended form. // Upper bits cleanup should be executed before Store. diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index f82e80965b7c..e1e6eea59884 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -75,7 +75,7 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump], def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs), (ins brtarget8:$dst, ccode:$cond), "j${cond}\t$dst", - [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>; + [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>; let hasSideEffects = 0 in { def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs), (ins brtarget16:$dst, ccode:$cond), @@ -145,6 +145,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; + // Win64 wants indirect jumps leaving the function to have a REX_W prefix. + // These are switched from TAILJMPr/m64_REX in MCInstLower. + let isCodeGenOnly = 1, hasREX_WPrefix = 1 in { + def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>; + let mayLoad = 1 in + def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), + "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>; + + } + // Non-tracking jumps for IBT, use with caution. let isCodeGenOnly = 1 in { def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst", @@ -273,39 +284,35 @@ let isCall = 1 in // Tail call stuff. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [ESP, SSP] in { - def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable; - def TCRETURNri : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + isCodeGenOnly = 1, Uses = [ESP, SSP] in { + def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; + def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in - def TCRETURNmi : PseudoI<(outs), - (ins i32mem_TC:$dst, i32imm:$offset), []>; + def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>; - // FIXME: The should be pseudo instructions that are lowered when going to - // mcinst. - def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), - (ins i32imm_pcrel:$dst), "jmp\t$dst", []>; + def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "", []>; // FIXME: Remove encoding when JIT is dead. + def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst", []>; + def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [ESP, EFLAGS, SSP] in { def TCRETURNdicc : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. 
- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>; } @@ -348,34 +355,36 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { + isCodeGenOnly = 1, Uses = [RSP, SSP] in { def TCRETURNdi64 : PseudoI<(outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset), - []>; + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>, NotMemoryFoldable; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable; + (ins i64mem_TC:$dst, i32imm:$offset), + []>, Sched<[WriteJumpLd]>, NotMemoryFoldable; - def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), - "jmp\t$dst", []>; + def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst), + []>, Sched<[WriteJump]>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst", []>; + def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; // Win64 wants indirect jumps leaving the function to have a REX_W prefix. let hasREX_WPrefix = 1 in { - def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst), + []>, Sched<[WriteJump]>; let mayLoad = 1 in - def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "rex64 jmp{q}\t{*}$dst", []>; + def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst), + []>, Sched<[WriteJumpLd]>; } } @@ -403,13 +412,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, // Conditional tail calls are similar to the above, but they are branches // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, - isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + isCodeGenOnly = 1, SchedRW = [WriteJump] in let Uses = [RSP, EFLAGS, SSP] in { def TCRETURNdi64cc : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; // This gets substituted to a conditional jump instruction in MC lowering. - def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs), - (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>; + def TAILJMPd64_CC : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>; } diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 06e605fe5db2..7a4eb138ec34 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -17,19 +17,18 @@ let hasSideEffects = 0 in { let Defs = [EAX], Uses = [AX] in // EAX = signext(AX) def CWDE : I<0x98, RawFrm, (outs), (ins), "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>; + let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; + // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register + // allocator crashes if you remove it. 
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX) def CWD : I<0x99, RawFrm, (outs), (ins), "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>; let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX) def CDQ : I<0x99, RawFrm, (outs), (ins), "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>; - - - let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) - def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; - let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), "{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index d42fec3770c7..f3b286e0375c 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -292,6 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { { X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD }, + { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, @@ -5245,6 +5247,270 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, }; +static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { + { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD }, + { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS }, + { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS }, + { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD }, + { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD }, + { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD }, + { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS }, + { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS }, + { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS }, + { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD }, + { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD }, + { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD }, + { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS }, + { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS }, + { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS }, + { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD }, + { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD }, + { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS }, + { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD }, + { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD }, + { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD }, + { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS }, + { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS }, + { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS }, + { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD }, + { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD }, + { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD }, + { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS }, + { 
X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS }, + { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS }, + { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD }, + { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD }, + { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD }, + { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS }, + { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS }, + { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS }, + { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD }, + { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD }, + { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD }, + { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS }, + { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS }, + { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS }, + { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D }, + { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D }, + { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D }, + { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q }, + { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q }, + { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q }, + { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D }, + { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D }, + { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D }, + { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D }, + { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D }, + { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D }, + { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q }, + { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q }, + { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q }, + { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q }, + { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q }, + { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q }, + { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D }, + { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D }, + { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D }, + { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D }, + { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D }, + { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D }, + { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q }, + { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D }, + { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D }, + { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D }, + { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q }, + { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q }, + { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q }, + { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D }, + { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D }, + { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D }, + { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q }, + { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q }, + { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q }, + { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D }, + { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D }, + { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D }, + { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q }, + { X86::VPMAXUDZ128rr, 
X86::VPMAXUDZ128rmb, TB_BCAST_D }, + { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D }, + { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D }, + { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q }, + { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q }, + { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q }, + { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D }, + { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D }, + { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D }, + { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q }, + { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q }, + { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q }, + { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D }, + { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D }, + { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D }, + { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q }, + { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q }, + { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q }, + { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D }, + { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D }, + { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D }, + { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q }, + { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q }, + { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q }, + { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D }, + { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D }, + { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D }, + { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q }, + { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q }, + { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q }, + { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D }, + { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D }, + { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D }, + { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q }, + { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q }, + { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q }, + { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D }, + { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D }, + { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D }, + { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q }, + { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q }, + { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D }, + { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D }, + { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D }, + { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q }, + { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q }, + { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q }, + { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD }, + { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD }, + { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD }, + { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS }, + { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS }, + { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, +}; + +static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { + { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD }, + { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS }, + { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, 
TB_BCAST_SD }, + { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD }, + { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS }, + { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD }, + { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS }, + { 
X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD }, + { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS }, + { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD }, + { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS }, + { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD }, + { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { #ifndef NDEBUG @@ -5287,6 +5553,18 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { std::end(MemoryFoldTable4)) == std::end(MemoryFoldTable4) && "MemoryFoldTable4 is not sorted and 
unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) && + std::adjacent_find(std::begin(BroadcastFoldTable2), + std::end(BroadcastFoldTable2)) == + std::end(BroadcastFoldTable2) && + "BroadcastFoldTable2 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) && + std::adjacent_find(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) == + std::end(BroadcastFoldTable3) && + "BroadcastFoldTable3 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } #endif @@ -5355,6 +5633,15 @@ struct X86MemUnfoldTable { // Index 4, folded load addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD); + // Broadcast tables. + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) + // Index 2, folded broadcast + addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + // Sort the memory->reg unfold table. array_pod_sort(Table.begin(), Table.end()); diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h index 419baf98f61d..7dc236a0d7e4 100644 --- a/lib/Target/X86/X86InstrFoldTables.h +++ b/lib/Target/X86/X86InstrFoldTables.h @@ -19,35 +19,48 @@ namespace llvm { enum { // Select which memory operand is being unfolded. - // (stored in bits 0 - 3) + // (stored in bits 0 - 2) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, TB_INDEX_4 = 4, - TB_INDEX_MASK = 0xf, + TB_INDEX_MASK = 0x7, // Do not insert the reverse map (MemOp -> RegOp) into the table. // This may be needed because there is a many -> one mapping. - TB_NO_REVERSE = 1 << 4, + TB_NO_REVERSE = 1 << 3, // Do not insert the forward map (RegOp -> MemOp) into the table. // This is needed for Native Client, which prohibits branch // instructions from using a memory operand. - TB_NO_FORWARD = 1 << 5, + TB_NO_FORWARD = 1 << 4, - TB_FOLDED_LOAD = 1 << 6, - TB_FOLDED_STORE = 1 << 7, + TB_FOLDED_LOAD = 1 << 5, + TB_FOLDED_STORE = 1 << 6, + TB_FOLDED_BCAST = 1 << 7, // Minimum alignment required for load/store. - // Used for RegOp->MemOp conversion. - // (stored in bits 8 - 15) + // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0 + // to mean align of 0. + // (stored in bits 8 - 11) TB_ALIGN_SHIFT = 8, - TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, - TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, - TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, - TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, - TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT, + TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT, + + // Broadcast type. + // (stored in bits 12 - 13) + TB_BCAST_TYPE_SHIFT = 12, + TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT, + TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT, + + // Unused bits 14-15 }; // This struct is used for both the folding and unfold tables. 
They KeyOp diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 096cc27861ca..de6f8a81dff6 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; }]>; +def X86VBroadcastld8 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 1; +}]>; + +def X86VBroadcastld16 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2; +}]>; + +def X86VBroadcastld32 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4; +}]>; + +def X86VBroadcastld64 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; +}]>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); @@ -963,6 +985,10 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{ return N->isExactlyValue(+0.0); }]>; +def fp128imm0 : PatLeaf<(f128 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index dbe45356c42b..c29029daeec9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -30,7 +30,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -465,7 +465,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, /// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r. static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { // Don't waste compile time scanning use-def chains of physregs. - if (!TargetRegisterInfo::isVirtualRegister(BaseReg)) + if (!Register::isVirtualRegister(BaseReg)) return false; bool isPICBase = false; for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg), @@ -480,9 +480,50 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { } bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const { + AAResults *AA) const { switch (MI.getOpcode()) { - default: break; + default: + // This function should only be called for opcodes with the ReMaterializable + // flag set. 
+ llvm_unreachable("Unknown rematerializable operation!"); + break; + + case X86::LOAD_STACK_GUARD: + case X86::AVX1_SETALLONES: + case X86::AVX2_SETALLONES: + case X86::AVX512_128_SET0: + case X86::AVX512_256_SET0: + case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: + case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0F128: + case X86::AVX_SET0: + case X86::FsFLD0SD: + case X86::FsFLD0SS: + case X86::FsFLD0F128: + case X86::KSET0D: + case X86::KSET0Q: + case X86::KSET0W: + case X86::KSET1D: + case X86::KSET1Q: + case X86::KSET1W: + case X86::MMX_SET0: + case X86::MOV32ImmSExti8: + case X86::MOV32r0: + case X86::MOV32r1: + case X86::MOV32r_1: + case X86::MOV32ri64: + case X86::MOV64ImmSExti8: + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::MOV16ri: + case X86::MOV32ri: + case X86::MOV64ri: + case X86::MOV64ri32: + case X86::MOV8ri: + return true; + case X86::MOV8rm: case X86::MOV8rm_NOREX: case X86::MOV16rm: @@ -561,7 +602,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, MI.getOperand(1 + X86::AddrIndexReg).isReg() && MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && MI.isDereferenceableInvariantLoad(AA)) { - unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); + Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. @@ -583,7 +624,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, // lea fi#, lea GV, etc. are all rematerializable. if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) return true; - unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); + Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. @@ -594,10 +635,6 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, return false; } } - - // All other instructions marked M_REMATERIALIZABLE are always trivially - // rematerializable. - return true; } void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, @@ -664,7 +701,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { } bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, - unsigned Opc, bool AllowSP, unsigned &NewSrc, + unsigned Opc, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const { MachineFunction &MF = *MI.getParent()->getParent(); @@ -675,7 +712,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } - unsigned SrcReg = Src.getReg(); + Register SrcReg = Src.getReg(); // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. @@ -684,7 +721,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, isKill = Src.isKill(); assert(!Src.isUndef() && "Undef op doesn't need optimization"); - if (TargetRegisterInfo::isVirtualRegister(NewSrc) && + if (Register::isVirtualRegister(NewSrc) && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; @@ -693,7 +730,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, // This is for an LEA64_32r and incoming registers are 32-bit. One way or // another we need to add 64-bit registers to the final MI. 
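// Editorial note (not part of this patch): the unsigned -> Register changes in
// this hunk are mechanical, but the surrounding logic is easiest to see with a
// concrete case. On a 64-bit target, convertToThreeAddress can rewrite
//   addl %esi, %edi          ; 32-bit two-address add
// as
//   leal (%rdi,%rsi), %edi   ; LEA64_32r, sources are 64-bit super-registers
// which is why classifyLEAReg may have to hand back a 64-bit register plus an
// implicit operand, per the "add 64-bit registers to the final MI" comment
// just above.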
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + if (Register::isPhysicalRegister(SrcReg)) { ImplicitOp = Src; ImplicitOp.setImplicit(); @@ -740,8 +777,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( return nullptr; unsigned Opcode = X86::LEA64_32r; - unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); - unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); + Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); + Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass); // Build and insert into an implicit UNDEF value. This is OK because // we will be shifting and then extracting the lower 8/16-bits. @@ -751,8 +788,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). MachineBasicBlock::iterator MBBI = MI.getIterator(); - unsigned Dest = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit; @@ -794,7 +831,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( case X86::ADD8rr_DB: case X86::ADD16rr: case X86::ADD16rr_DB: { - unsigned Src2 = MI.getOperand(2).getReg(); + Register Src2 = MI.getOperand(2).getReg(); bool IsKill2 = MI.getOperand(2).isKill(); assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); unsigned InRegLEA2 = 0; @@ -888,7 +925,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; // LEA can't handle RSP. - if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) && + if (Register::isVirtualRegister(Src.getReg()) && !MF.getRegInfo().constrainRegClass(Src.getReg(), &X86::GR64_NOSPRegClass)) return nullptr; @@ -911,7 +948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, // LEA can't handle ESP. bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -947,7 +984,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -970,7 +1007,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill, ImplicitOp, LV)) @@ -1005,7 +1042,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, Opc = Is64Bit ? 
X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1013,7 +1050,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, const MachineOperand &Src2 = MI.getOperand(2); bool isKill2; - unsigned SrcReg2; + Register SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, SrcReg2, isKill2, ImplicitOp2, LV)) @@ -1054,7 +1091,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1085,6 +1122,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return nullptr; case X86::SUB32ri8: case X86::SUB32ri: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1093,7 +1132,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill; - unsigned SrcReg; + Register SrcReg; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, SrcReg, isKill, ImplicitOp, LV)) @@ -1111,6 +1150,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SUB64ri8: case X86::SUB64ri32: { + if (!MI.getOperand(2).isImm()) + return nullptr; int64_t Imm = MI.getOperand(2).getImm(); if (!isInt<32>(-Imm)) return nullptr; @@ -1140,40 +1181,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk: case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: - case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: { + case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: + case X86::VBROADCASTSDZ256mk: + case X86::VBROADCASTSDZmk: + case X86::VBROADCASTSSZ128mk: + case X86::VBROADCASTSSZ256mk: + case X86::VBROADCASTSSZmk: + case X86::VPBROADCASTDZ128mk: + case X86::VPBROADCASTDZ256mk: + case X86::VPBROADCASTDZmk: + case X86::VPBROADCASTQZ128mk: + case X86::VPBROADCASTQZ256mk: + case X86::VPBROADCASTQZmk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; - case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; - case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; - case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; - case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; - case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; - case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVUPSZ128rmk: Opc = 
X86::VBLENDMPSZ128rmk; break; - case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; - case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; - case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; + case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; + case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; + case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; + case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; + case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; + case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break; + case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break; + case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break; + case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break; + case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break; + case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break; + case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break; + case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break; + case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break; + case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break; + case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break; } NewMI = 
BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1187,6 +1250,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .add(MI.getOperand(7)); break; } + case X86::VMOVDQU8Z128rrk: case X86::VMOVDQU8Z256rrk: case X86::VMOVDQU8Zrrk: @@ -1683,6 +1747,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } + case X86::VCMPSDZrr: + case X86::VCMPSSZrr: + case X86::VCMPPDZrri: + case X86::VCMPPSZrri: + case X86::VCMPPDZ128rri: + case X86::VCMPPSZ128rri: + case X86::VCMPPDZ256rri: + case X86::VCMPPSZ256rri: + case X86::VCMPPDZrrik: + case X86::VCMPPSZrrik: + case X86::VCMPPDZ128rrik: + case X86::VCMPPSZ128rrik: + case X86::VCMPPDZ256rrik: + case X86::VCMPPSZ256rrik: { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f; + Imm = X86::getSwappedVCMPImm(Imm); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } case X86::VPERM2F128rr: case X86::VPERM2I128rr: { // Flip permute source immediate. @@ -1859,7 +1944,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. - unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); + Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); unsigned CommutableOpIdx1; for (CommutableOpIdx1 = LastCommutableVecOp; @@ -1889,7 +1974,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, return true; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.isCommutable()) @@ -1926,17 +2012,23 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7; switch (Imm) { + default: + // EVEX versions can be commuted. + if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX) + break; + return false; case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2 (or 2 and 3 - // when masked). - // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, - 2 + OpOffset); + break; } - return false; + + // The indices of the commutable operands are 1 and 2 (or 2 and 3 + // when masked). + // Assign them to the returned operand indices here. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset, + 2 + OpOffset); } case X86::MOVSSrr: // X86::MOVSDrr is always commutable. 
MOVSS is only commutable if we can @@ -1990,6 +2082,24 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + case X86::VPDPWSSDZ128r: + case X86::VPDPWSSDZ128rk: + case X86::VPDPWSSDZ128rkz: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ256rk: + case X86::VPDPWSSDZ256rkz: + case X86::VPDPWSSDZr: + case X86::VPDPWSSDZrk: + case X86::VPDPWSSDZrkz: + case X86::VPDPWSSDSZ128r: + case X86::VPDPWSSDSZ128rk: + case X86::VPDPWSSDSZ128rkz: + case X86::VPDPWSSDSZ256r: + case X86::VPDPWSSDSZ256rk: + case X86::VPDPWSSDSZ256rkz: + case X86::VPDPWSSDSZr: + case X86::VPDPWSSDSZrk: + case X86::VPDPWSSDSZrkz: case X86::VPMADD52HUQZ128r: case X86::VPMADD52HUQZ128rk: case X86::VPMADD52HUQZ128rkz: @@ -2215,7 +2325,7 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) { } } -/// Get the VPCMP immediate if the opcodes are swapped. +/// Get the VPCMP immediate if the operands are swapped. unsigned X86::getSwappedVPCMPImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2233,7 +2343,7 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) { return Imm; } -/// Get the VPCOM immediate if the opcodes are swapped. +/// Get the VPCOM immediate if the operands are swapped. unsigned X86::getSwappedVPCOMImm(unsigned Imm) { switch (Imm) { default: llvm_unreachable("Unreachable!"); @@ -2251,6 +2361,23 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) { return Imm; } +/// Get the VCMP immediate if the operands are swapped. +unsigned X86::getSwappedVCMPImm(unsigned Imm) { + // Only need the lower 2 bits to distinquish. + switch (Imm & 0x3) { + default: llvm_unreachable("Unreachable!"); + case 0x00: case 0x03: + // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted. + break; + case 0x01: case 0x02: + // Need to toggle bits 3:0. Bit 4 stays the same. 
+ Imm ^= 0xf; + break; + } + + return Imm; +} + bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { if (!MI.isTerminator()) return false; @@ -3131,25 +3258,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(isKill)); } -void X86InstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.addReg(SrcReg, getKillRegState(isKill)); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - - void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIdx, @@ -3164,23 +3272,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } -void X86InstrInfo::loadRegFromAddr( - MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); - DebugLoc DL; - MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); - for (unsigned i = 0, e = Addr.size(); i != e; ++i) - MIB.add(Addr[i]); - MIB.setMemRefs(MMOs); - NewMIs.push_back(MIB); -} - bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { @@ -3599,8 +3690,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (!IsCmpZero && !Sub) return false; - bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && - Sub->getOperand(2).getReg() == SrcReg); + bool IsSwapped = + (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg); // Scan forward from the instruction after CmpInstr for uses of EFLAGS. // It is safe to remove CmpInstr if EFLAGS is redefined or killed. @@ -3755,7 +3847,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + Register Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; // Do not fold if we have a subreg use or a def. 
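// Illustrative sketch (not part of this patch): the commute rule implemented
// by X86::getSwappedVCMPImm above, written as a standalone function (the name
// is local to this example) so its effect on a few well-known AVX compare
// predicates can be checked. Swapping the two sources turns the "less"
// predicates into the matching "greater" predicates and vice versa; only bits
// 3:0 are toggled, so bit 4, which selects the signalling/quiet variants, is
// preserved.
constexpr unsigned swappedVCMPImm(unsigned Imm) {
  unsigned Low2 = Imm & 0x3;       // Same discriminator the patch uses.
  if (Low2 == 0x1 || Low2 == 0x2)  // LT/LE/NLT/NLE/GE/GT style predicates.
    Imm ^= 0xf;
  return Imm;                      // EQ/NE/ORD/UNORD/TRUE/FALSE unchanged.
}

static_assert(swappedVCMPImm(0x01) == 0x0e, "LT_OS <-> GT_OS");
static_assert(swappedVCMPImm(0x02) == 0x0d, "LE_OS <-> GE_OS");
static_assert(swappedVCMPImm(0x11) == 0x1e, "LT_OQ <-> GT_OQ");
static_assert(swappedVCMPImm(0x00) == 0x00, "EQ_OQ is symmetric");
static_assert(swappedVCMPImm(0x03) == 0x03, "UNORD_Q is symmetric");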
@@ -3785,7 +3877,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any @@ -3815,7 +3907,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // Insert the XOR. BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) @@ -3891,7 +3983,7 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); auto Flags = MachineMemOperand::MOLoad | @@ -3929,7 +4021,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { - unsigned DestReg = MIB->getOperand(0).getReg(); + Register DestReg = MIB->getOperand(0).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(DestReg) < 16) { // We can use a normal VEX encoded load. @@ -3952,7 +4044,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx) { - unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(SrcReg) < 16) { // We can use a normal VEX encoded store. @@ -4008,12 +4100,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? 
X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned SrcReg = MIB->getOperand(0).getReg(); - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register SrcReg = MIB->getOperand(0).getReg(); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(X86::VXORPSrr)); MIB.addReg(SrcReg, RegState::ImplicitDefine); @@ -4021,9 +4114,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_128_SET0: case X86::AVX512_FsFLD0SS: - case X86::AVX512_FsFLD0SD: { + case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0F128: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) return Expand2AddrUndef(MIB, @@ -4037,10 +4131,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); - unsigned SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { - unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); @@ -4060,14 +4154,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::AVX1_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. MIB->setDesc(get(X86::VCMPPSYrri)); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } case X86::AVX512_512_SETALLONES: { - unsigned Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB->getOperand(0).getReg(); MIB->setDesc(get(X86::VPTERNLOGDZrri)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. @@ -4077,8 +4171,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_512_SEXT_MASK_32: case X86::AVX512_512_SEXT_MASK_64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned MaskReg = MIB->getOperand(1).getReg(); + Register Reg = MIB->getOperand(0).getReg(); + Register MaskReg = MIB->getOperand(1).getReg(); unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; @@ -4115,8 +4209,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::MOV32ri64: { - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit); + Register Reg = MIB->getOperand(0).getReg(); + Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); MI.setDesc(get(X86::MOV32ri)); MIB->getOperand(0).setReg(Reg32); MIB.addReg(Reg, RegState::ImplicitDefine); @@ -4251,8 +4345,8 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // If MI is marked as reading Reg, the partial register update is wanted. 
const MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + Register Reg = MO.getReg(); + if (Register::isVirtualRegister(Reg)) { if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { @@ -4268,7 +4362,10 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. -static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { +static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, + bool ForLoadFold = false) { + // Set the OpNum parameter to the first source operand. + OpNum = 1; switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: @@ -4427,6 +4524,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: return true; + case X86::VMOVSSZrrk: + case X86::VMOVSDZrrk: + OpNum = 3; + return true; + case X86::VMOVSSZrrkz: + case X86::VMOVSDZrrkz: + OpNum = 2; + return true; } return false; @@ -4449,14 +4554,11 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) { unsigned X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode())) + if (!hasUndefRegUpdate(MI.getOpcode(), OpNum)) return 0; - // Set the OpNum parameter to the first source operand. - OpNum = 1; - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) { return UndefRegClearance; } return 0; @@ -4464,7 +4566,7 @@ X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, void X86InstrInfo::breakPartialRegDependency( MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - unsigned Reg = MI.getOperand(OpNum).getReg(); + Register Reg = MI.getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. if (MI.killsRegister(Reg, TRI)) return; @@ -4480,7 +4582,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + Register XReg = TRI->getSubReg(Reg, X86::sub_xmm); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4489,7 +4591,7 @@ void X86InstrInfo::breakPartialRegDependency( } else if (X86::GR64RegClass.contains(Reg)) { // Using XOR32rr because it has shorter encoding and zeros up the upper bits // as well. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit); + Register XReg = TRI->getSubReg(Reg, X86::sub_32bit); BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg) .addReg(XReg, RegState::Undef) .addReg(XReg, RegState::Undef) @@ -4538,8 +4640,8 @@ static void updateOperandRegConstraints(MachineFunction &MF, // We only need to update constraints on virtual register operands. 
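// Editorial note (not part of this patch): the OpNum out-parameter added to
// hasUndefRegUpdate feeds getUndefRegClearance and breakPartialRegDependency,
// which target instructions such as
//   vcvtsi2ss %eax, %xmm7, %xmm0
// where %xmm7 is read only to pass its upper elements through. Even if that
// source is undef, the instruction still depends on whatever last wrote
// %xmm7; when too few independent instructions separate them, the false
// dependence is broken by first emitting a self-xor of the register
// (xorps/vxorps, or a 32-bit xor of the low half for a GPR), which is what
// breakPartialRegDependency, shown above, builds.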
if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (!TRI.isVirtualRegister(Reg)) + Register Reg = MO.getReg(); + if (!Register::isVirtualRegister(Reg)) continue; auto *NewRC = MRI.constrainRegClass( @@ -4698,7 +4800,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) || + unsigned Ignored; + if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4788,6 +4891,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; if (Align < MinAlign) return nullptr; bool NarrowToMOV32rm = false; @@ -4821,8 +4925,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If this is the special case where we use a MOV32rm to load a 32-bit // value and zero-extend the top bits. Change the destination register // to a 32-bit one. - unsigned DstReg = NewMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + Register DstReg = NewMI->getOperand(0).getReg(); + if (Register::isPhysicalRegister(DstReg)) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); @@ -5133,6 +5237,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: Alignment = 16; break; case X86::MMX_SET0: @@ -5182,7 +5288,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: - case X86::AVX512_FsFLD0SS: { + case X86::AVX512_FsFLD0SS: + case X86::FsFLD0F128: + case X86::AVX512_FsFLD0F128: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. 
@@ -5212,6 +5320,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction().getContext()); + else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) + Ty = Type::getFP128Ty(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || @@ -5293,6 +5403,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) { return StoreMMOs; } +static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, + const TargetRegisterClass *RC, + const X86Subtarget &STI) { + assert(STI.hasAVX512() && "Expected at least AVX512!"); + unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC); + assert((SpillSize == 64 || STI.hasVLX()) && + "Can't broadcast less than 64 bytes without AVX512VL!"); + + switch (I->Flags & TB_BCAST_MASK) { + default: llvm_unreachable("Unexpected broadcast type!"); + case TB_BCAST_D: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTDZ128m; + case 32: return X86::VPBROADCASTDZ256m; + case 64: return X86::VPBROADCASTDZm; + } + break; + case TB_BCAST_Q: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VPBROADCASTQZ128m; + case 32: return X86::VPBROADCASTQZ256m; + case 64: return X86::VPBROADCASTQZm; + } + break; + case TB_BCAST_SS: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VBROADCASTSSZ128m; + case 32: return X86::VBROADCASTSSZ256m; + case 64: return X86::VBROADCASTSSZm; + } + break; + case TB_BCAST_SD: + switch (SpillSize) { + default: llvm_unreachable("Unknown spill size"); + case 16: return X86::VMOVDDUPZ128rm; + case 32: return X86::VBROADCASTSDZ256m; + case 64: return X86::VBROADCASTSDZm; + } + break; + } +} + bool X86InstrInfo::unfoldMemoryOperand( MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const { @@ -5303,6 +5458,7 @@ bool X86InstrInfo::unfoldMemoryOperand( unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; @@ -5311,7 +5467,9 @@ bool X86InstrInfo::unfoldMemoryOperand( UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) @@ -5335,10 +5493,26 @@ bool X86InstrInfo::unfoldMemoryOperand( AfterOps.push_back(Op); } - // Emit the load instruction. + // Emit the load or broadcast instruction. 
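// Illustrative sketch (not part of this patch): what the new TB_FOLDED_BCAST
// path accomplishes. Unfolding a broadcast-folded instruction such as
//   vmulps (%rdi){1to16}, %zmm1, %zmm0
// re-materializes the memory operand with the broadcast opcode chosen by
// getBroadcastOpcode above, e.g.
//   vbroadcastss (%rdi), %zmm2
//   vmulps %zmm2, %zmm1, %zmm0
// The selection reduces to the (element type, spill size) mapping below; the
// names are invented for this example, only the mnemonics come from the
// switch above. The one irregular entry is the 16-byte f64 case, which uses
// vmovddup because vbroadcastsd has no 128-bit form.
enum class BcastElt { D, Q, SS, SD };

// Returns the broadcast mnemonic used when unfolding; the destination width
// (xmm/ymm/zmm) follows the spill size of the register class (16/32/64 bytes).
inline const char *broadcastMnemonic(BcastElt Elt, unsigned SpillSize) {
  switch (Elt) {
  case BcastElt::D:  return "vpbroadcastd";
  case BcastElt::Q:  return "vpbroadcastq";
  case BcastElt::SS: return "vbroadcastss";
  case BcastElt::SD: return SpillSize == 16 ? "vmovddup" : "vbroadcastsd";
  }
  return "unknown";
}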
if (UnfoldLoad) { auto MMOs = extractLoadMMOs(MI.memoperands(), MF); - loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs, NewMIs); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); + } + + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); + if (UnfoldStore) { // Address operands cannot be marked isKill. for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { @@ -5404,7 +5578,16 @@ bool X86InstrInfo::unfoldMemoryOperand( if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); - storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs, NewMIs); + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) + MIB.add(AddrOps[i]); + MIB.addReg(Reg, RegState::Kill); + MIB.setMemRefs(MMOs); + NewMIs.push_back(MIB); } return true; @@ -5423,6 +5606,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, unsigned Index = I->Flags & TB_INDEX_MASK; bool FoldedLoad = I->Flags & TB_FOLDED_LOAD; bool FoldedStore = I->Flags & TB_FOLDED_STORE; + bool FoldedBCast = I->Flags & TB_FOLDED_BCAST; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -5456,10 +5640,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, return false; // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. - unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; - Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, - VT, MVT::Other, AddrOps); + + unsigned Opc; + if (FoldedBCast) { + Opc = getBroadcastOpcode(I, RC, Subtarget); + } else { + unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); + bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); + } + + Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. @@ -7367,6 +7558,96 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { } } +Optional<ParamLoadedValue> +X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const { + const MachineOperand *Op = nullptr; + DIExpression *Expr = nullptr; + + switch (MI.getOpcode()) { + case X86::LEA32r: + case X86::LEA64r: + case X86::LEA64_32r: { + // Operand 4 could be global address. For now we do not support + // such situation. 
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm()) + return None; + + const MachineOperand &Op1 = MI.getOperand(1); + const MachineOperand &Op2 = MI.getOperand(3); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister || + Register::isPhysicalRegister(Op2.getReg()))); + + // Omit situations like: + // %rsi = lea %rsi, 4, ... + if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) || + Op2.getReg() == MI.getOperand(0).getReg()) + return None; + else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister && + TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) || + (Op2.getReg() != X86::NoRegister && + TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg()))) + return None; + + int64_t Coef = MI.getOperand(2).getImm(); + int64_t Offset = MI.getOperand(4).getImm(); + SmallVector<uint64_t, 8> Ops; + + if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) { + Op = &Op1; + } else if (Op1.isFI()) + Op = &Op1; + + if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) { + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef + 1); + Ops.push_back(dwarf::DW_OP_mul); + } else { + if (Op && Op2.getReg() != X86::NoRegister) { + int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false); + if (dwarfReg < 0) + return None; + else if (dwarfReg < 32) { + Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg); + Ops.push_back(0); + } else { + Ops.push_back(dwarf::DW_OP_bregx); + Ops.push_back(dwarfReg); + Ops.push_back(0); + } + } else if (!Op) { + assert(Op2.getReg() != X86::NoRegister); + Op = &Op2; + } + + if (Coef > 1) { + assert(Op2.getReg() != X86::NoRegister); + Ops.push_back(dwarf::DW_OP_constu); + Ops.push_back(Coef); + Ops.push_back(dwarf::DW_OP_mul); + } + + if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) && + Op2.getReg() != X86::NoRegister) { + Ops.push_back(dwarf::DW_OP_plus); + } + } + + DIExpression::appendOffset(Ops, Offset); + Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops); + + return ParamLoadedValue(*Op, Expr);; + } + case X86::XOR32rr: { + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) + return ParamLoadedValue(MachineOperand::CreateImm(0), Expr); + return None; + } + default: + return TargetInstrInfo::describeLoadedValue(MI); + } +} + /// This is an architecture-specific helper function of reassociateOps. /// Set special operand attributes for new instructions after reassociation. void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, @@ -7500,9 +7781,8 @@ namespace { // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx // addq %rcx, %rax // RAX now holds address of _GLOBAL_OFFSET_TABLE_. - unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); - unsigned GOTReg = - RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); + Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass); BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg) .addReg(X86::RIP) .addImm(0) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 13ca17139494..22b7b1d4cb19 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -67,6 +67,9 @@ unsigned getSwappedVPCMPImm(unsigned Imm); /// Get the VPCOM immediate if the opcodes are swapped. unsigned getSwappedVPCOMImm(unsigned Imm); +/// Get the VCMP immediate if the opcodes are swapped. 
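// Editorial note (not part of this patch): a worked example for the LEA case
// of describeLoadedValue above. For
//   leaq 0x10(%rdi,%rsi,4), %rax
// Op1 is the base %rdi, Op2 the index %rsi, Coef = 4 and Offset = 16, so the
// returned ParamLoadedValue pairs the %rdi operand with the expression
//   DW_OP_breg4 RSI+0, DW_OP_constu 4, DW_OP_mul, DW_OP_plus,
//   DW_OP_plus_uconst 0x10
// i.e. the described value is rdi + 4*rsi + 16 (RSI is DWARF register 4 on
// x86-64, hence DW_OP_breg4). The XOR32rr case in the same switch recognizes
// the common "xorl %reg, %reg" zeroing idiom and describes the value as the
// constant 0.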
+unsigned getSwappedVCMPImm(unsigned Imm); + } // namespace X86 /// isGlobalStubReference - Return true if the specified TargetFlag operand is @@ -203,7 +206,7 @@ public: int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr &MI, - AliasAnalysis *AA) const override; + AAResults *AA) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr &Orig, @@ -218,7 +221,7 @@ public: /// Reference parameters are set to indicate how caller should add this /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, - unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, + unsigned LEAOpcode, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV) const; @@ -251,7 +254,7 @@ public: /// findCommutedOpIndices(MI, Op1, Op2); /// can be interpreted as a query asking to find an operand that would be /// commutable with the operand#1. - bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, + bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; /// Returns an adjusted FMA opcode that must be used in FMA instruction that @@ -317,23 +320,11 @@ public: const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const; - void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - ArrayRef<MachineMemOperand *> MMOs, - SmallVectorImpl<MachineInstr *> &NewMIs) const; - bool expandPostRAPseudo(MachineInstr &MI) const override; /// Check whether the target can fold a load that feeds a subreg operand @@ -527,6 +518,13 @@ public: #define GET_INSTRINFO_HELPER_DECLS #include "X86GenInstrInfo.inc" + static bool hasLockPrefix(const MachineInstr &MI) { + return MI.getDesc().TSFlags & X86II::LOCK; + } + + Optional<ParamLoadedValue> + describeLoadedValue(const MachineInstr &MI) const override; + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 8e05dd8ec5c1..e452145f3b65 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -673,6 +673,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { ImmSExti64i32AsmOperand]; } +// 4-bit immediate used by some XOP instructions +// [0, 0xF] +def ImmUnsignedi4AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi4"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidImmUnsignedi4"; +} + // Unsigned immediate used by SSE/AVX instructions // [0, 0xFF] // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] @@ -705,6 +713,13 @@ def i64i8imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } +// Unsigned 4-bit immediate used by some XOP instructions. 
+def u4imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi4AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + // Unsigned 8-bit immediate used by SSE/AVX instructions. def u8imm : Operand<i8> { let PrintMethod = "printU8Imm"; @@ -925,7 +940,6 @@ def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">; def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; -def HasMPX : Predicate<"Subtarget->hasMPX()">; def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; def HasCLWB : Predicate<"Subtarget->hasCLWB()">; @@ -1103,7 +1117,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 2 && !LD->isVolatile(); + return LD->getAlignment() >= 2 && LD->isSimple(); return false; }]>; @@ -1113,7 +1127,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ if (ExtType == ISD::NON_EXTLOAD) return true; if (ExtType == ISD::EXTLOAD) - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); return false; }]>; @@ -1170,7 +1184,7 @@ def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [ if (LD->getMemoryVT() == MVT::i32) return true; - return LD->getAlignment() >= 4 && !LD->isVolatile(); + return LD->getAlignment() >= 4 && LD->isSimple(); }]>; @@ -2404,25 +2418,26 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { } multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, - RegisterClass RC, X86MemOperand x86memop> { + RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched> { let hasSideEffects = 0 in { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS]>; + T8PS, VEX_4V, Sched<[sched]>; let mayLoad = 1 in def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[WriteBLS.Folded]>; + T8PS, VEX_4V, Sched<[sched.Folded]>; } } let Predicates = [HasBMI], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W; } //===----------------------------------------------------------------------===// @@ -2683,12 +2698,12 @@ def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", multiclass lwpins_intr<RegisterClass RC> { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, 
$cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>, XOP_4V, XOPA; } @@ -2700,11 +2715,11 @@ let Defs = [EFLAGS] in { multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA; + [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>, + [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>, XOP_4V, XOPA; } @@ -3205,13 +3220,13 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", - (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", - (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", - (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", - (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">; // clr aliases. def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 57835b1a256a..cd9a866c91cb 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -30,7 +30,6 @@ def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>; let Constraints = "$src1 = $dst" in { // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, X86FoldableSchedWrite sched, bit Commutable = 0, X86MemOperand OType = i64mem> { @@ -67,7 +66,7 @@ let Constraints = "$src1 = $dst" in { def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>, + [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>, Sched<[schedImm]>; } } @@ -114,13 +113,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId, def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>, Sched<[sched]>; def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -496,14 +495,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, - (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>, + (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>, Sched<[SchedWriteShuffle.MMX]>; def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR64:$dst, (int_x86_sse_pshuf_w (load_mmx addr:$src1), - imm:$src2))]>, + timm:$src2))]>, Sched<[SchedWriteShuffle.MMX.Folded]>; // -- Conversion Instructions @@ -535,7 +534,7 @@ def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, - imm:$src2))]>, + timm:$src2))]>, Sched<[WriteVecExtract]>; let Constraints = "$src1 = $dst" in { let Predicates = [HasMMX, HasSSE1] in { @@ -544,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in { (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, - GR32orGR64:$src2, imm:$src3))]>, + GR32orGR64:$src2, timm:$src3))]>, Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, @@ -553,7 +552,7 @@ let Predicates = [HasMMX, HasSSE1] in { "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, (i32 (anyext (loadi16 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } } @@ -567,6 +566,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (int_x86_mmx_pmovmskb VR64:$src))]>, Sched<[WriteMMXMOVMSK]>; +// MMX to XMM for vector types +def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, + [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; + +def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)), + (v2i64 (MMX_MOVQ2DQrr VR64:$src))>; + // Low word of XMM to MMX. 
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; @@ -574,9 +580,13 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; -def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), +def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))), (x86mmx (MMX_MOVQ64rm addr:$src))>; +def : Pat<(v2i64 (X86vzmovl (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src)))))), + (MMX_MOVQ2DQrr VR64:$src)>; + // Misc. let SchedRW = [SchedWriteShuffle.MMX] in { let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in @@ -602,9 +612,6 @@ def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), (MMX_CVTTPS2PIirr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q - (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))), - (MMX_CVTTPS2PIirr VR128:$src)>; -def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), (MMX_CVTPD2PIirr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index f7d931510fe2..44ba071947c2 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -12,16 +12,16 @@ // //===----------------------------------------------------------------------===// -// FIXME: Investigate a better scheduler class once MPX is used inside LLVM. +// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM. let SchedRW = [WriteSystem] in { multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; @@ -29,17 +29,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, Not64BitMode]>; + Requires<[Not64BitMode]>; def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, - Requires<[HasMPX, In64BitMode]>; + Requires<[In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable; defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable; @@ -47,33 +47,31 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable; def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayLoad = 1 in { def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64rm : I<0x1A, 
MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; } let isCodeGenOnly = 1, ForceDisassemble = 1 in def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX]>, NotMemoryFoldable; + NotMemoryFoldable; let mayStore = 1 in { def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; + Requires<[Not64BitMode]>, NotMemoryFoldable; def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), "bndmov\t{$src, $dst|$dst, $src}", []>, PD, - Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + Requires<[In64BitMode]>, NotMemoryFoldable; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src), - "bndstx\t{$src, $dst|$dst, $src}", []>, PS, - Requires<[HasMPX]>; + "bndstx\t{$src, $dst|$dst, $src}", []>, PS; } let mayLoad = 1 in def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), - "bndldx\t{$src, $dst|$dst, $src}", []>, PS, - Requires<[HasMPX]>; + "bndldx\t{$src, $dst|$dst, $src}", []>, PS; } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 7d0a5b87baf4..09a04c0338b4 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -115,7 +115,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", - [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; + [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>; + def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>; } //===----------------------------------------------------------------------===// @@ -128,13 +130,18 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero] in { + isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4f32 immAllZerosV))]>; } -let Predicates = [NoAVX512] in +let Predicates = [NoAVX512] in { +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +} // The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, @@ -147,6 +154,14 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllZerosV))]>; } +let Predicates = [NoAVX512] in { +def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>; +def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>; +def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>; +def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>; +} + // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-ones value if folding it would be beneficial. 
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -355,7 +370,7 @@ defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.YMM>, PS, VEX, VEX_L, VEX_WIG; -defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, PD, VEX, VEX_L, VEX_WIG; } @@ -661,7 +676,7 @@ let Predicates = [UseSSE1] in { // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1, + def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1, (i8 -28)), (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)), @@ -727,7 +742,7 @@ let Predicates = [UseSSE1] in { // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll // end up with a movsd or blend instead of shufp. // No need for aligned load, we're only loading 64-bits. - def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))), + def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))), (MOVHPSrm VR128:$src1, addr:$src2)>; @@ -761,7 +776,7 @@ let Predicates = [UseSSE2] in { let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in { // Use MOVLPD to load into the low bits from a full vector unless we can use // BLENDPD. - def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))), + def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; } @@ -1713,12 +1728,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, let isCommutable = 1 in def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>, Sched<[sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))]>, + (ld_frag addr:$src2), timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1751,13 +1766,13 @@ multiclass sse12_cmp_scalar_int<Operand memop, def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, imm:$cc))]>, + VR128:$src, timm:$cc))]>, Sched<[sched]>; let mayLoad = 1 in def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src, u8imm:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - mem_cpat:$src, imm:$cc))]>, + mem_cpat:$src, timm:$cc))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1876,12 +1891,12 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, - [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, + [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, Sched<[sched]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set 
RC:$dst, - (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, + (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1906,7 +1921,7 @@ let Constraints = "$src1 = $dst" in { SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; } -def CommutableCMPCC : PatLeaf<(imm), [{ +def CommutableCMPCC : PatLeaf<(timm), [{ uint64_t Imm = N->getZExtValue() & 0x7; return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); }]>; @@ -1915,47 +1930,47 @@ def CommutableCMPCC : PatLeaf<(imm), [{ let Predicates = [HasAVX] in { def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, CommutableCMPCC:$cc)), - (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; + (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>; def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, CommutableCMPCC:$cc)), - (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; + (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>; def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, CommutableCMPCC:$cc)), - (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, CommutableCMPCC:$cc)), - (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; + (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; } let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, CommutableCMPCC:$cc)), - (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; } let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, CommutableCMPCC:$cc)), - (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, CommutableCMPCC:$cc)), - (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; + (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; } //===----------------------------------------------------------------------===// @@ -1970,13 +1985,13 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], d>, + (i8 timm:$src3))))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = IsCommutable in def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d>, + (i8 timm:$src3))))], d>, Sched<[sched]>; } @@ -2097,7 +2112,7 @@ let Predicates = [HasAVX1Only] in { let Predicates = [UseSSE2] in { // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 
def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (v2f64 (nonvolatile_load addr:$src2)))), + (v2f64 (simple_load addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; } @@ -2721,7 +2736,7 @@ defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; - + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -3482,7 +3497,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, Sched<[schedImm]>; } @@ -3514,7 +3529,7 @@ multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, + [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; } @@ -3597,7 +3612,7 @@ let Predicates = [HasAVX, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, VEX, Sched<[sched.XMM]>, VEX_WIG; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), @@ -3605,7 +3620,7 @@ let Predicates = [HasAVX, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (load addr:$src1), - (i8 imm:$src2))))]>, VEX, + (i8 timm:$src2))))]>, VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; } @@ -3615,7 +3630,7 @@ let Predicates = [HasAVX2, prd] in { !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, + (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2), @@ -3623,7 +3638,7 @@ let Predicates = [HasAVX2, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode (load addr:$src1), - (i8 imm:$src2))))]>, VEX, VEX_L, + (i8 timm:$src2))))]>, VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; } @@ -3633,7 +3648,7 @@ let Predicates = [UseSSE2] in { !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, Sched<[sched.XMM]>; def mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), @@ -3641,7 +3656,7 @@ let Predicates = [UseSSE2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (memop addr:$src1), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[sched.XMM.Folded]>; } } @@ -4380,7 +4395,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), + def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), 
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; @@ -4388,7 +4403,7 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseSSE3] in { // No need for aligned memory as this only loads 64-bits. - def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), + def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (MOVDDUPrm addr:$src)>; @@ -4812,7 +4827,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>, + [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>, Sched<[sched]>; let mayLoad = 1 in def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), @@ -4823,7 +4838,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, (VT (X86PAlignr RC:$src1, (memop_frag addr:$src2), - (i8 imm:$src3))))]>, + (i8 timm:$src3))))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5300,7 +5315,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, + (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>, Sched<[SchedWriteFShuffle.XMM]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2, u8imm:$src3), @@ -5311,7 +5326,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { [(set VR128:$dst, (X86insertps VR128:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, + timm:$src3))]>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -5323,17 +5338,6 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; } -let Predicates = [UseAVX] in { - // If we're inserting an element from a vbroadcast of a load, fold the - // load into the X86insertps instruction. 
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), - (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), - (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; -} - //===----------------------------------------------------------------------===// // SSE4.1 - Round Instructions //===----------------------------------------------------------------------===// @@ -5348,7 +5352,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, + [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, Sched<[sched]>; // Vector intrinsic operation, mem @@ -5357,13 +5361,13 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, + (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, Sched<[sched.Folded]>; } multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { -let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, @@ -5378,7 +5382,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 -let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { +let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, @@ -5396,7 +5400,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched> { -let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, @@ -5411,7 +5415,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { []>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, hasSideEffects = 0 -let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { +let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, @@ -5431,7 +5435,7 @@ multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr, X86FoldableSchedWrite sched, ValueType VT32, ValueType VT64, SDNode OpNode, bit Is2Addr = 1> { -let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedSingle in { def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -5439,7 +5443,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, + [(set VR128:$dst, (VT32 
(OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; def SSm_Int : SS4AIi8<opcss, MRMSrcMem, @@ -5450,11 +5454,11 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 -let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { +let ExeDomain = SSEPackedDouble in { def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -5462,7 +5466,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, + [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, Sched<[sched]>; def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, @@ -5473,7 +5477,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } @@ -5508,17 +5512,17 @@ let Predicates = [UseAVX] in { } let Predicates = [UseAVX] in { - def : Pat<(X86VRndScale FR32:$src1, imm:$src2), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, imm:$src2), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; } let Predicates = [UseAVX, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), - (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), - (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; } let ExeDomain = SSEPackedSingle in @@ -5535,17 +5539,17 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { - def : Pat<(X86VRndScale FR32:$src1, imm:$src2), - (ROUNDSSr FR32:$src1, imm:$src2)>; - def : Pat<(X86VRndScale FR64:$src1, imm:$src2), - (ROUNDSDr FR64:$src1, imm:$src2)>; + def : Pat<(X86VRndScale FR32:$src1, timm:$src2), + (ROUNDSSr FR32:$src1, timm:$src2)>; + def : Pat<(X86VRndScale FR64:$src1, timm:$src2), + (ROUNDSDr FR64:$src1, timm:$src2)>; } let Predicates = [UseSSE41, OptForSize] in { - def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), - (ROUNDSSm addr:$src1, imm:$src2)>; - def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), - (ROUNDSDm addr:$src1, imm:$src2)>; + def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2), + (ROUNDSSm addr:$src1, timm:$src2)>; + def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2), + (ROUNDSDm 
addr:$src1, timm:$src2)>; } //===----------------------------------------------------------------------===// @@ -5826,7 +5830,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>, Sched<[sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), @@ -5836,7 +5840,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>, + (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5853,7 +5857,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), @@ -5863,27 +5867,27 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } -def BlendCommuteImm2 : SDNodeXForm<imm, [{ +def BlendCommuteImm2 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue() & 0x03; return getI8Imm(Imm ^ 0x03, SDLoc(N)); }]>; -def BlendCommuteImm4 : SDNodeXForm<imm, [{ +def BlendCommuteImm4 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue() & 0x0f; return getI8Imm(Imm ^ 0x0f, SDLoc(N)); }]>; -def BlendCommuteImm8 : SDNodeXForm<imm, [{ +def BlendCommuteImm8 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue() & 0xff; return getI8Imm(Imm ^ 0xff, SDLoc(N)); }]>; // Turn a 4-bit blendi immediate to 8-bit for use with pblendw. -def BlendScaleImm4 : SDNodeXForm<imm, [{ +def BlendScaleImm4 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 4; ++i) { @@ -5894,7 +5898,7 @@ def BlendScaleImm4 : SDNodeXForm<imm, [{ }]>; // Turn a 2-bit blendi immediate to 8-bit for use with pblendw. -def BlendScaleImm2 : SDNodeXForm<imm, [{ +def BlendScaleImm2 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5905,7 +5909,7 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{ }]>; // Turn a 2-bit blendi immediate to 4-bit for use with pblendd. -def BlendScaleImm2to4 : SDNodeXForm<imm, [{ +def BlendScaleImm2to4 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5916,7 +5920,7 @@ def BlendScaleImm2to4 : SDNodeXForm<imm, [{ }]>; // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. 
-def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{ +def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 4; ++i) { @@ -5927,7 +5931,7 @@ def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{ }]>; // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. -def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{ +def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -5938,7 +5942,7 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{ }]>; // Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. -def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{ +def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); uint8_t NewImm = 0; for (unsigned i = 0; i != 2; ++i) { @@ -6008,7 +6012,7 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), @@ -6018,14 +6022,14 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Pattern to commute if load is in first source. - def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)), (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, - (commuteXForm imm:$src3))>; + (commuteXForm timm:$src3))>; } let Predicates = [HasAVX] in { @@ -6061,37 +6065,37 @@ let Predicates = [HasAVX2] in { // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. // ExecutionDomainFixPass will cleanup domains later on. let Predicates = [HasAVX1Only] in { -def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), - (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), - (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), - (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), + (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), + (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>; // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movsd via commuting under optsize. 
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; - -def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3), - (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>; -def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3), - (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>; -def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3), - (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; + +def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3), + (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>; +def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>; +def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3), + (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>; // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movss via commuting under optsize. -def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), - (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), + (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; } defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, @@ -6107,19 +6111,19 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, let Predicates = [UseSSE41] in { // Use pblendw for 128-bit integer to keep it in the integer domain and prevent // it from becoming movss via commuting under optsize. 
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; -def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; +def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; -def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), - (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3), - (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), + (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3), + (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; } // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -6592,7 +6596,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 imm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), @@ -6600,7 +6604,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), - (i8 imm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6718,26 +6722,26 @@ let Predicates = [HasAVX, HasAES] in { (ins VR128:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; } def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, Sched<[WriteAESKeyGen]>; def 
AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>, + (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>, Sched<[WriteAESKeyGen.Folded]>; //===----------------------------------------------------------------------===// @@ -6745,7 +6749,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), //===----------------------------------------------------------------------===// // Immediate transform to help with commuting. -def PCLMULCommuteImm : SDNodeXForm<imm, [{ +def PCLMULCommuteImm : SDNodeXForm<timm, [{ uint8_t Imm = N->getZExtValue(); return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); }]>; @@ -6758,7 +6762,7 @@ let Predicates = [NoAVX, HasPCLMUL] in { (ins VR128:$src1, VR128:$src2, u8imm:$src3), "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, Sched<[WriteCLMul]>; def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), @@ -6766,14 +6770,14 @@ let Predicates = [NoAVX, HasPCLMUL] in { "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), - imm:$src3))]>, + timm:$src3))]>, Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; } // Constraints = "$src1 = $dst" def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, - (i8 imm:$src3)), + (i8 timm:$src3)), (PCLMULQDQrm VR128:$src1, addr:$src2, - (PCLMULCommuteImm imm:$src3))>; + (PCLMULCommuteImm timm:$src3))>; } // Predicates = [NoAVX, HasPCLMUL] // SSE aliases @@ -6795,21 +6799,21 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, (ins RC:$src1, RC:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set RC:$dst, - (IntId RC:$src1, RC:$src2, imm:$src3))]>, + (IntId RC:$src1, RC:$src2, timm:$src3))]>, Sched<[WriteCLMul]>; def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, MemOp:$src2, u8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set RC:$dst, - (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, + (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; // We can commute a load in the first operand by swapping the sources and // rotating the immediate. 
- def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), + def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, - (PCLMULCommuteImm imm:$src3))>; + (PCLMULCommuteImm timm:$src3))>; } let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in @@ -6853,8 +6857,8 @@ let Constraints = "$src = $dst" in { def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", - [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, - imm:$idx))]>, + [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, + timm:$idx))]>, PD, Sched<[SchedWriteVecALU.XMM]>; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -6867,7 +6871,7 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, - imm:$len, imm:$idx))]>, + timm:$len, timm:$idx))]>, XD, Sched<[SchedWriteVecALU.XMM]>; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -6907,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst), // class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, - PatFrag ld_frag, SchedWrite Sched> : + PatFrag bcast_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, Sched<[Sched]>, VEX; // AVX2 adds register forms @@ -6923,15 +6927,15 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, - f32mem, v4f32, loadf32, + f32mem, v4f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>; def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, - f32mem, v8f32, loadf32, + f32mem, v8f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>, VEX_L; } let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, - v4f64, loadf64, + v4f64, X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>, VEX_L; let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { @@ -6944,15 +6948,6 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (VBROADCASTSSrm addr:$src)>; - def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), - (VBROADCASTSSYrm addr:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), - (VBROADCASTSDYrm addr:$src)>; -} - //===----------------------------------------------------------------------===// // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both // halves of a 256-bit vector. 
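The commute-immediate transforms touched in the hunks above (BlendCommuteImm4, PCLMULCommuteImm) only change which constant is emitted when the two sources of a pattern are swapped. As an aside, here is a minimal standalone C++ sketch of that arithmetic, assuming the usual blendps/pclmulqdq immediate semantics; the helper names and the main() driver are invented for illustration and are not part of this patch:

// Standalone sketch (not from the patch): what the commute-immediate
// SDNodeXForms above compute. Helper names are invented for illustration.
#include <cassert>
#include <cstdint>

// BlendCommuteImm4: blendps/blendpd take an element from the second source
// where a mask bit is set, so swapping the sources means complementing the
// 4-bit mask.
static uint8_t commuteBlendImm4(uint8_t Imm) { return (Imm & 0x0f) ^ 0x0f; }

// PCLMULCommuteImm: pclmulqdq uses imm[0] to pick the quadword of the first
// source and imm[4] for the second, so swapping the sources swaps the nibbles.
static uint8_t commutePclmulImm(uint8_t Imm) {
  return (uint8_t)((Imm >> 4) | (Imm << 4));
}

int main() {
  assert(commuteBlendImm4(0x05) == 0x0a); // elements 0 and 2 now come from the other source
  assert(commutePclmulImm(0x01) == 0x10); // "high x low" instead of "low x high"
  return 0;
}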
@@ -7081,27 +7076,29 @@ let Predicates = [HasAVX1Only] in { // multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256> { + Intrinsic IntSt, Intrinsic IntSt256, + X86SchedWriteMaskMove schedX, + X86SchedWriteMaskMove schedY> { def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, - VEX_4V, Sched<[WriteFMaskedLoad]>; + VEX_4V, Sched<[schedX.RM]>; def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; + VEX_4V, VEX_L, Sched<[schedY.RM]>; def mr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, - VEX_4V, Sched<[WriteFMaskedStore]>; + VEX_4V, Sched<[schedX.MR]>; def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, - VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; + VEX_4V, VEX_L, Sched<[schedY.MR]>; } let ExeDomain = SSEPackedSingle in @@ -7109,13 +7106,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256>; + int_x86_avx_maskstore_ps_256, + WriteFMaskMove32, WriteFMaskMove32Y>; let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256>; + int_x86_avx_maskstore_pd_256, + WriteFMaskMove64, WriteFMaskMove64Y>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values @@ -7143,13 +7142,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, + [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX, Sched<[sched]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, + (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX, Sched<[sched.Folded]>; }// Predicates = [HasAVX, NoVLX] } @@ -7181,38 +7180,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, VEX_4V, VEX_L, + (i8 timm:$src3))))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 
VR256:$src1, (loadv4f64 addr:$src2), - (i8 imm:$src3)))]>, VEX_4V, VEX_L, + (i8 timm:$src3)))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; } // Immediate transform to help with commuting. -def Perm2XCommuteImm : SDNodeXForm<imm, [{ +def Perm2XCommuteImm : SDNodeXForm<timm, [{ return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); }]>; let Predicates = [HasAVX] in { // Pattern with load in other operand. def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, - (loadv4i64 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; + (loadv4i64 addr:$src2), (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>; // Pattern with load in other operand. def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; } //===----------------------------------------------------------------------===// @@ -7257,7 +7256,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, + [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>, TAPD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), @@ -7282,15 +7281,15 @@ let Predicates = [HasF16C, NoVLX] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), - (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; + (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. 
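The VPERM2F128 hunks make the same imm-to-timm change and rely on Perm2XCommuteImm to keep the commuted-load patterns correct: in the vperm2f128/vperm2i128 control byte, bits [1:0] choose the source half for the low 128 bits of the result and bits [5:4] choose it for the high 128 bits, with bit 1 of each field selecting between the first and the second source, so XOR-ing with 0x22 compensates for swapping the operands. A hedged standalone sketch follows; the names are placeholders, and only the ^ 0x22 rewrite comes from the diff.

#include <cassert>
#include <cstdint>

// Standalone illustration of the Perm2XCommuteImm transform shown above (the
// function name is a placeholder).  Flipping bit 1 and bit 5 of the control
// byte toggles the src1/src2 choice for both result halves, which is exactly
// what is needed after the two sources have been exchanged.
static uint8_t commutePerm2x128Immediate(uint8_t Imm) {
  return static_cast<uint8_t>(Imm ^ 0x22);
}

int main() {
  // 0x20 ("low half of src1, low half of src2") becomes 0x02, which selects
  // the same data once the operands are swapped.
  assert(commutePerm2x128Immediate(0x20) == 0x02);
  assert(commutePerm2x128Immediate(0x31) == 0x13);
  return 0;
}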
@@ -7327,20 +7326,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins RC:$src1, RC:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, Sched<[sched]>, VEX_4V; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>, + (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; // Pattern to commute if load is in first source. - def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)), + def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, - (commuteXForm imm:$src3))>; + (commuteXForm timm:$src3))>; } let Predicates = [HasAVX2] in { @@ -7351,19 +7350,19 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, SchedWriteBlend.YMM, VR256, i256mem, BlendCommuteImm8>, VEX_L; -def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), - (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), - (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; -def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), - (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; +def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), + (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; +def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), + (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; -def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), - (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>; -def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), - (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>; -def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), - (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>; +def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), + (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>; +def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>; +def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), + (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>; } // For insertion into the zero index (low half) of a 256-bit vector, it is @@ -7407,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0 // destination operand // multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, + X86MemOperand x86memop, PatFrag bcast_frag, ValueType OpVT128, ValueType OpVT256, Predicate prd> { let Predicates = [HasAVX2, prd] in { def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -7418,7 +7417,7 @@ 
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, + (OpVT128 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -7428,7 +7427,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, + (OpVT256 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; // Provide aliases for broadcast from the same register class that @@ -7439,13 +7438,13 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, } } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, v16i8, v32i8, NoVLX_Or_NoBWI>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, v8i16, v16i16, NoVLX_Or_NoBWI>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, v4i32, v8i32, NoVLX>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX] in { @@ -7455,14 +7454,11 @@ let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + // FIXME this is to handle aligned extloads from i8/i16. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), - (VPBROADCASTQYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -7483,17 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; -} -let Predicates = [HasAVX2, NoVLX] in { - // Provide aliases for broadcast from the same register class that - // automatically does the extract. - def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), - (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), - (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), - sub_xmm)))>; + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX] in { @@ -7509,45 +7500,41 @@ let Predicates = [HasAVX2, NoVLX] in { let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS + (VPBROADCASTBrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit)), - VR128)))>; + GR8:$src, sub_8bit))))>; def : Pat<(v32i8 (X86VBroadcast GR8:$src)), - (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS + (VPBROADCASTBYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR8:$src, sub_8bit)), - VR128)))>; + GR8:$src, sub_8bit))))>; def : Pat<(v8i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS + (VPBROADCASTWrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit)), - VR128)))>; + GR16:$src, sub_16bit))))>; def : Pat<(v16i16 (X86VBroadcast GR16:$src)), - (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS + (VPBROADCASTWYrr (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), - GR16:$src, sub_16bit)), - VR128)))>; + GR16:$src, sub_16bit))))>; } let Predicates = [HasAVX2, NoVLX] in { def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), - (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; def : Pat<(v2i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; + (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), - (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; + (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; } // AVX1 broadcast patterns let Predicates = [HasAVX1Only] in { -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), +def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSrm addr:$src)>; } @@ -7557,12 +7544,12 @@ let Predicates = [HasAVX, NoVLX] in { // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; - def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), (VMOVDDUPrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), (VMOVDDUPrm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), (VMOVDDUPrm addr:$src)>; @@ -7581,19 +7568,19 @@ let Predicates = [HasAVX1Only] in { (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; def : Pat<(v4i32 (X86VBroadcast GR32:$src)), - (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; + (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; def : Pat<(v8i32 (X86VBroadcast GR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 
0)), 1)>; + (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), + (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; def : Pat<(v4i64 (X86VBroadcast GR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), - (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; + (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), + (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; def : Pat<(v2i64 (X86VBroadcast i64:$src)), - (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; + def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; } @@ -7636,7 +7623,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>, Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins memOp:$src1, u8imm:$src2), @@ -7644,7 +7631,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), - (i8 imm:$src2))))]>, + (i8 timm:$src2))))]>, Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; } } @@ -7663,19 +7650,19 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, - (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), - (i8 imm:$src3)))]>, + (i8 timm:$src3)))]>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; let Predicates = [HasAVX2] in def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), - VR256:$src1, (i8 imm:$imm))), - (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + VR256:$src1, (i8 timm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; //===----------------------------------------------------------------------===// @@ -7760,7 +7747,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q_256>, VEX_W; multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, - ValueType MaskVT, string BlendStr, ValueType ZeroVT> { + ValueType MaskVT> { // masked store def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; @@ -7772,23 +7759,23 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; } let Predicates = [HasAVX] in { - defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>; - defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPDY", VR256, 
v4f64, v4i64, "VBLENDVPDY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>; } let Predicates = [HasAVX1Only] in { // load/store i32/i64 not supported use ps/pd version - defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>; } let Predicates = [HasAVX2] in { - defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; - defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; - defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>; + defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>; + defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>; + defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>; } //===----------------------------------------------------------------------===// @@ -7956,13 +7943,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "", - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))], SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>; def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "", [(set RC:$dst, (OpVT (OpNode RC:$src1, (MemOpFrag addr:$src2), - imm:$src3)))], SSEPackedInt>, + timm:$src3)))], SSEPackedInt>, Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>; } } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 7050e1917494..7f41feb6c0d9 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -43,7 +43,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; let SchedRW = [WriteSystem] in { def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", - [(int_x86_int imm:$trap)]>; + [(int_x86_int timm:$trap)]>; def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index fc0da845299f..3a1212342a13 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -45,7 +45,7 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", - [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; + [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>; } // SchedRW // HLE prefixes diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 66ca78556b82..229af366d940 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ 
-143,13 +143,13 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, + (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>, XOP, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>, + (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -251,7 +251,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$cc), @@ -260,14 +260,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), - imm:$cc)))]>, + timm:$cc)))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } def : Pat<(OpNode (load addr:$src2), - (vt128 VR128:$src1), imm:$cc), + (vt128 VR128:$src1), timm:$cc), (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2, - (CommuteVPCOMCC imm:$cc))>; + (CommuteVPCOMCC timm:$cc))>; } defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>; @@ -418,27 +418,27 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, X86FoldableSchedWrite sched> { def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, - (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>, + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>, Sched<[sched]>; def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst), - (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, intmemop:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3), - (i8 imm:$src4))))]>, VEX_W, + (i8 timm:$src4))))]>, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, fpmemop:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2), - RC:$src3, (i8 imm:$src4))))]>, + RC:$src3, (i8 timm:$src4))))]>, Sched<[sched.Folded, sched.ReadAfterFold, // fpmemop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -447,7 +447,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4), !strconcat(OpcodeStr, 
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>; diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 892a083f4d1a..01620b7b64c9 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -60,7 +60,7 @@ public: X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); - bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + bool select(MachineInstr &I) override; static const char *getName() { return DEBUG_TYPE; } private: @@ -94,11 +94,9 @@ private: MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const; + MachineFunction &MF); bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, @@ -217,7 +215,7 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) { } static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(Register::isPhysicalRegister(Reg)); if (X86::GR64RegClass.contains(Reg)) return &X86::GR64RegClass; if (X86::GR32RegClass.contains(Reg)) @@ -233,15 +231,15 @@ static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { // Set X86 Opcode and constrain DestReg. bool X86InstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); - unsigned SrcReg = I.getOperand(1).getReg(); + Register SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (Register::isPhysicalRegister(DstReg)) { assert(I.isCopy() && "Generic operators do not allow physical registers"); if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID && @@ -253,7 +251,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRC != DstRC) { // This case can be generated by ABI lowering, performe anyext - unsigned ExtSrc = MRI.createVirtualRegister(DstRC); + Register ExtSrc = MRI.createVirtualRegister(DstRC); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG)) .addDef(ExtSrc) @@ -268,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && + assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) && "No phys reg on generic operators"); assert((DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. 
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) && + (Register::isPhysicalRegister(SrcReg) && DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && "Copy with different width?!"); @@ -282,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, if (SrcRegBank.getID() == X86::GPRRegBankID && DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize && - TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + Register::isPhysicalRegister(SrcReg)) { // Change the physical register to performe truncate. const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg); @@ -308,8 +306,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } -bool X86InstructionSelector::select(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +bool X86InstructionSelector::select(MachineInstr &I) { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -333,7 +330,7 @@ bool X86InstructionSelector::select(MachineInstr &I, assert(I.getNumOperands() == I.getNumExplicitOperands() && "Generic instruction has unexpected implicit operands\n"); - if (selectImpl(I, CoverageInfo)) + if (selectImpl(I, *CoverageInfo)) return true; LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); @@ -370,10 +367,10 @@ bool X86InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_UADDE: return selectUadde(I, MRI, MF); case TargetOpcode::G_UNMERGE_VALUES: - return selectUnmergeValues(I, MRI, MF, CoverageInfo); + return selectUnmergeValues(I, MRI, MF); case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_CONCAT_VECTORS: - return selectMergeValues(I, MRI, MF, CoverageInfo); + return selectMergeValues(I, MRI, MF); case TargetOpcode::G_EXTRACT: return selectExtract(I, MRI, MF); case TargetOpcode::G_INSERT: @@ -512,7 +509,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); @@ -572,7 +569,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); // Use LEA to calculate frame index and GEP @@ -625,7 +622,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, AM.Base.Reg = X86::RIP; } - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); unsigned NewOpc = getLeaOP(Ty, STI); @@ -644,7 +641,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_CONSTANT) && "unexpected instruction"); - const unsigned DefReg = I.getOperand(0).getReg(); + const Register DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID) @@ -717,8 +714,8 @@ bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I, I.getOpcode() == TargetOpcode::G_PTRTOINT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = 
I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -781,8 +778,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -892,8 +889,8 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); @@ -952,8 +949,8 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, std::tie(CC, SwapArgs) = X86::getX86ConditionCode( (CmpInst::Predicate)I.getOperand(1).getPredicate()); - unsigned LHS = I.getOperand(2).getReg(); - unsigned RHS = I.getOperand(3).getReg(); + Register LHS = I.getOperand(2).getReg(); + Register RHS = I.getOperand(3).getReg(); if (SwapArgs) std::swap(LHS, RHS); @@ -998,8 +995,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction"); - unsigned LhsReg = I.getOperand(2).getReg(); - unsigned RhsReg = I.getOperand(3).getReg(); + Register LhsReg = I.getOperand(2).getReg(); + Register RhsReg = I.getOperand(3).getReg(); CmpInst::Predicate Predicate = (CmpInst::Predicate)I.getOperand(1).getPredicate(); @@ -1033,7 +1030,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, break; } - unsigned ResultReg = I.getOperand(0).getReg(); + Register ResultReg = I.getOperand(0).getReg(); RBI.constrainGenericRegister( ResultReg, *getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI); @@ -1043,8 +1040,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I, .addReg(LhsReg) .addReg(RhsReg); - unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); - unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass); + Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass); MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1089,11 +1086,11 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned CarryOutReg = I.getOperand(1).getReg(); - const unsigned Op0Reg = I.getOperand(2).getReg(); - const unsigned Op1Reg = I.getOperand(3).getReg(); - unsigned CarryInReg = I.getOperand(4).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register CarryOutReg = I.getOperand(1).getReg(); + const Register Op0Reg = I.getOperand(2).getReg(); + const Register Op1Reg = I.getOperand(3).getReg(); + Register CarryInReg = I.getOperand(4).getReg(); const LLT DstTy = 
MRI.getType(DstReg); @@ -1149,8 +1146,8 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I, assert((I.getOpcode() == TargetOpcode::G_EXTRACT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); int64_t Index = I.getOperand(2).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1281,9 +1278,9 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned InsertReg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register SrcReg = I.getOperand(1).getReg(); + const Register InsertReg = I.getOperand(2).getReg(); int64_t Index = I.getOperand(3).getImm(); const LLT DstTy = MRI.getType(DstReg); @@ -1335,14 +1332,13 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, } bool X86InstructionSelector::selectUnmergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) && "unexpected instruction"); // Split to extracts. unsigned NumDefs = I.getNumOperands() - 1; - unsigned SrcReg = I.getOperand(NumDefs).getReg(); + Register SrcReg = I.getOperand(NumDefs).getReg(); unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { @@ -1352,7 +1348,7 @@ bool X86InstructionSelector::selectUnmergeValues( .addReg(SrcReg) .addImm(Idx * DefSize); - if (!select(ExtrInst, CoverageInfo)) + if (!select(ExtrInst)) return false; } @@ -1361,15 +1357,14 @@ bool X86InstructionSelector::selectUnmergeValues( } bool X86InstructionSelector::selectMergeValues( - MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, - CodeGenCoverage &CoverageInfo) const { + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) { assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES || I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) && "unexpected instruction"); // Split to inserts. - unsigned DstReg = I.getOperand(0).getReg(); - unsigned SrcReg0 = I.getOperand(1).getReg(); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg0 = I.getOperand(1).getReg(); const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg0); @@ -1378,13 +1373,13 @@ bool X86InstructionSelector::selectMergeValues( const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); // For the first src use insertSubReg. 
- unsigned DefReg = MRI.createGenericVirtualRegister(DstTy); + Register DefReg = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(DefReg, RegBank); if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF)) return false; for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { - unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); + Register Tmp = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(Tmp, RegBank); MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), @@ -1395,7 +1390,7 @@ bool X86InstructionSelector::selectMergeValues( DefReg = Tmp; - if (!select(InsertInst, CoverageInfo)) + if (!select(InsertInst)) return false; } @@ -1403,7 +1398,7 @@ bool X86InstructionSelector::selectMergeValues( TII.get(TargetOpcode::COPY), DstReg) .addReg(DefReg); - if (!select(CopyInst, CoverageInfo)) + if (!select(CopyInst)) return false; I.eraseFromParent(); @@ -1415,7 +1410,7 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I, MachineFunction &MF) const { assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction"); - const unsigned CondReg = I.getOperand(0).getReg(); + const Register CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr &TestInst = @@ -1442,7 +1437,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, if (CM != CodeModel::Small && CM != CodeModel::Large) return false; - const unsigned DstReg = I.getOperand(0).getReg(); + const Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); unsigned Align = DstTy.getSizeInBits(); @@ -1460,7 +1455,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. 
- unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); @@ -1503,7 +1498,7 @@ bool X86InstructionSelector::selectImplicitDefOrPHI( I.getOpcode() == TargetOpcode::G_PHI) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); if (!MRI.getRegClassOrNull(DstReg)) { const LLT DstTy = MRI.getType(DstReg); @@ -1537,7 +1532,7 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, I.getOpcode() == TargetOpcode::G_LSHR) && "unexpected instruction"); - unsigned DstReg = I.getOperand(0).getReg(); + Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); @@ -1578,8 +1573,8 @@ bool X86InstructionSelector::selectShift(MachineInstr &I, return false; } - unsigned Op0Reg = I.getOperand(1).getReg(); - unsigned Op1Reg = I.getOperand(2).getReg(); + Register Op0Reg = I.getOperand(1).getReg(); + Register Op1Reg = I.getOperand(2).getReg(); assert(MRI.getType(Op1Reg).getSizeInBits() == 8); @@ -1606,9 +1601,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, I.getOpcode() == TargetOpcode::G_UREM) && "unexpected instruction"); - const unsigned DstReg = I.getOperand(0).getReg(); - const unsigned Op1Reg = I.getOperand(1).getReg(); - const unsigned Op2Reg = I.getOperand(2).getReg(); + const Register DstReg = I.getOperand(0).getReg(); + const Register Op1Reg = I.getOperand(1).getReg(); + const Register Op2Reg = I.getOperand(2).getReg(); const LLT RegTy = MRI.getType(DstReg); assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) && @@ -1732,7 +1727,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpSignExtend)); else { - unsigned Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); + Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0), Zero32); @@ -1770,8 +1765,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I, if ((I.getOpcode() == Instruction::SRem || I.getOpcode() == Instruction::URem) && OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) { - unsigned SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); - unsigned ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); + Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg) .addReg(X86::AX); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 40141d894629..1d7adbaa9e99 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8, - CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, + CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI, CVTPD2PS_MASK, INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE, INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE, @@ -1101,8 +1101,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { 
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), - X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), + X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0), + X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 00fb1b573858..04121f863c89 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -13,6 +13,7 @@ #include "X86LegalizerInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -84,6 +85,24 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, verify(*STI.getInstrInfo()); } +bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const { + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + if (createMemLibcall(MIRBuilder, MRI, MI) == + LegalizerHelper::UnableToLegalize) + return false; + MI.eraseFromParent(); + return true; + default: + break; + } + return true; +} + void X86LegalizerInfo::setLegalizerInfo32bit() { const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0)); @@ -158,6 +177,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ANYEXT, Ty}, Legal); } setAction({G_ANYEXT, s128}, Legal); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); // Comparison setAction({G_ICMP, s1}, Legal); diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h index d21707b9ab9b..7a0f13fb5ae6 100644 --- a/lib/Target/X86/X86LegalizerInfo.h +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -32,6 +32,9 @@ private: public: X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM); + bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &MIRBuilder) const override; + private: void setLegalizerInfo32bit(); void setLegalizerInfo64bit(); diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index b1fefaa84be4..78098fd6262f 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -427,6 +427,41 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI, } } +// Replace TAILJMP opcodes with their equivalent opcodes that have encoding +// information. 
+static unsigned convertTailJumpOpcode(unsigned Opcode) { + switch (Opcode) { + case X86::TAILJMPr: + Opcode = X86::JMP32r; + break; + case X86::TAILJMPm: + Opcode = X86::JMP32m; + break; + case X86::TAILJMPr64: + Opcode = X86::JMP64r; + break; + case X86::TAILJMPm64: + Opcode = X86::JMP64m; + break; + case X86::TAILJMPr64_REX: + Opcode = X86::JMP64r_REX; + break; + case X86::TAILJMPm64_REX: + Opcode = X86::JMP64m_REX; + break; + case X86::TAILJMPd: + case X86::TAILJMPd64: + Opcode = X86::JMP_1; + break; + case X86::TAILJMPd_CC: + case X86::TAILJMPd64_CC: + Opcode = X86::JCC_1; + break; + } + + return Opcode; +} + void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); @@ -500,21 +535,190 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register - // inputs modeled as normal uses instead of implicit uses. As such, truncate - // off all but the first operand (the callee). FIXME: Change isel. - case X86::TAILJMPr64: - case X86::TAILJMPr64_REX: - case X86::CALL64r: - case X86::CALL64pcrel32: { - unsigned Opcode = OutMI.getOpcode(); - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik: + case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik: + case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmi: case X86::VPCMPBZrmik: + case X86::VPCMPBZrri: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmi: case X86::VPCMPDZrmik: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPDZrri: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmi: case X86::VPCMPQZrmik: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPQZrri: case X86::VPCMPQZrrik: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik: + case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik: + case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmi: case X86::VPCMPWZrmik: + case X86::VPCMPWZrri: case X86::VPCMPWZrrik: { + // Turn immediate 0 into the VPCMPEQ instruction. 
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; 
break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + // Turn immediate 6 into the VPCMPGT instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc 
= X86::VPCMPGTQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + break; } + // CALL64r, CALL64pcrel32 - These instructions used to have + // register inputs modeled as normal uses instead of implicit uses. As such, + // they we used to truncate off all but the first operand (the callee). This + // issue seems to have been fixed at some point. This assert verifies that. + case X86::CALL64r: + case X86::CALL64pcrel32: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + break; + case X86::EH_RETURN: case X86::EH_RETURN64: { OutMI = MCInst(); @@ -539,36 +743,30 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } - // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump - // instruction. - { - unsigned Opcode; - case X86::TAILJMPr: - Opcode = X86::JMP32r; - goto SetTailJmpOpcode; - case X86::TAILJMPd: - case X86::TAILJMPd64: - Opcode = X86::JMP_1; - goto SetTailJmpOpcode; - - SetTailJmpOpcode: - MCOperand Saved = OutMI.getOperand(0); - OutMI = MCInst(); - OutMI.setOpcode(Opcode); - OutMI.addOperand(Saved); - break; - } + // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump + // instruction. 
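The removed goto-based block is replaced by a shared convertTailJumpOpcode helper whose body is not part of this excerpt. The mapping below is a sketch inferred from the code deleted above and from the cases that now call the helper; the memory-form and _REX entries in particular are assumptions rather than quotes from the patch.

// Sketch only: inferred mapping from TAILJMP* pseudos to the real jump opcodes.
static unsigned convertTailJumpOpcodeSketch(unsigned Opcode) {
  switch (Opcode) {
  case X86::TAILJMPr:        return X86::JMP32r;
  case X86::TAILJMPm:        return X86::JMP32m;      // assumption
  case X86::TAILJMPr64:      return X86::JMP64r;      // assumption
  case X86::TAILJMPm64:      return X86::JMP64m;      // assumption
  case X86::TAILJMPr64_REX:  return X86::JMP64r_REX;  // assumption
  case X86::TAILJMPm64_REX:  return X86::JMP64m_REX;  // assumption
  case X86::TAILJMPd:
  case X86::TAILJMPd64:      return X86::JMP_1;
  case X86::TAILJMPd_CC:
  case X86::TAILJMPd64_CC:   return X86::JCC_1;
  default:                   return Opcode;           // not a tail-jump pseudo
  }
}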
+ case X86::TAILJMPr: + case X86::TAILJMPr64: + case X86::TAILJMPr64_REX: + case X86::TAILJMPd: + case X86::TAILJMPd64: + assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; case X86::TAILJMPd_CC: - case X86::TAILJMPd64_CC: { - MCOperand Saved = OutMI.getOperand(0); - MCOperand Saved2 = OutMI.getOperand(1); - OutMI = MCInst(); - OutMI.setOpcode(X86::JCC_1); - OutMI.addOperand(Saved); - OutMI.addOperand(Saved2); + case X86::TAILJMPd64_CC: + assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); + break; + + case X86::TAILJMPm: + case X86::TAILJMPm64: + case X86::TAILJMPm64_REX: + assert(OutMI.getNumOperands() == X86::AddrNumOperands && + "Unexpected number of operands!"); + OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode())); break; - } case X86::DEC16r: case X86::DEC32r: @@ -958,7 +1156,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, // FAULTING_LOAD_OP <def>, <faltinf type>, <MBB handler>, // <opcode>, <operands> - unsigned DefRegister = FaultingMI.getOperand(0).getReg(); + Register DefRegister = FaultingMI.getOperand(0).getReg(); FaultMaps::FaultKind FK = static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm()); MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol(); @@ -1079,7 +1277,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, // Emit MOV to materialize the target address and the CALL to target. // This is encoded with 12-13 bytes, depending on which register is used. - unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + Register ScratchReg = MI.getOperand(ScratchIdx).getReg(); if (X86II::isX86_64ExtendedReg(ScratchReg)) EncodedBytes = 13; else @@ -1369,6 +1567,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, recordSled(CurSled, MI, SledKind::TAIL_CALL); unsigned OpCode = MI.getOperand(0).getImm(); + OpCode = convertTailJumpOpcode(OpCode); MCInst TC; TC.setOpcode(OpCode); @@ -1538,8 +1737,6 @@ static void printConstant(const Constant *COp, raw_ostream &CS) { void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86. if (EmitFPOData) { @@ -1577,17 +1774,16 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { // Otherwise, use the .seh_ directives for all other Windows platforms. 
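Note that the .seh_ emission cases below no longer route the register operand through RI->getSEHRegNum(): the SEH_* pseudo operands are expected to already carry the Win64 SEH register number. That implies the translation now happens when the pseudos are created; a sketch of the producer side under that assumption (the actual BuildMI sites live in X86FrameLowering, outside this excerpt):

// Assumption: frame lowering bakes the SEH register number into the pseudo,
// so EmitSEHInstruction can pass the immediate straight to the streamer.
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
    .addImm(TRI->getSEHRegNum(Reg))
    .setMIFlag(MachineInstr::FrameSetup);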
switch (MI->getOpcode()) { case X86::SEH_PushReg: - OutStreamer->EmitWinCFIPushReg( - RI->getSEHRegNum(MI->getOperand(0).getImm())); + OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm()); break; case X86::SEH_SaveReg: - OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; case X86::SEH_SaveXMM: - OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); break; @@ -1596,9 +1792,8 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { break; case X86::SEH_SetFrame: - OutStreamer->EmitWinCFISetFrame( - RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); + OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(), + MI->getOperand(1).getImm()); break; case X86::SEH_PushFrame: @@ -1650,7 +1845,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::EH_RETURN: case X86::EH_RETURN64: { // Lower these as normal, but add some comments. - unsigned Reg = MI->getOperand(0).getReg(); + Register Reg = MI->getOperand(0).getReg(); OutStreamer->AddComment(StringRef("eh_return, addr: %") + X86ATTInstPrinter::getRegisterName(Reg)); break; @@ -1697,11 +1892,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::MASKPAIR16LOAD: { int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned Reg = MI->getOperand(0).getReg(); - unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + Register Reg = MI->getOperand(0).getReg(); + Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); // Load the first mask register MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm); @@ -1730,11 +1923,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::MASKPAIR16STORE: { int64_t Disp = MI->getOperand(X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); - unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg(); - unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); + Register Reg = MI->getOperand(X86::AddrNumOperands).getReg(); + Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); // Store the first mask register MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk); diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index d7e535598d81..5cb80a082b56 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// is stashed. signed char RestoreBasePointerOffset = 0; + /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame + /// in bytes. + DenseMap<int, unsigned> WinEHXMMSlotInfo; + /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. 
unsigned CalleeSavedFrameSize = 0; @@ -120,6 +124,10 @@ public: void setRestoreBasePointer(const MachineFunction *MF); int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; } + const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const { + return WinEHXMMSlotInfo; } + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index 7f75598b0655..1aee01563c4b 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -198,8 +198,7 @@ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) { static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2) { return MO1.isIdenticalTo(MO2) && - (!MO1.isReg() || - !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); + (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg())); } #ifndef NDEBUG @@ -235,9 +234,9 @@ static inline bool isLEA(const MachineInstr &MI) { namespace { -class OptimizeLEAPass : public MachineFunctionPass { +class X86OptimizeLEAPass : public MachineFunctionPass { public: - OptimizeLEAPass() : MachineFunctionPass(ID) {} + X86OptimizeLEAPass() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "X86 LEA Optimize"; } @@ -246,6 +245,8 @@ public: /// been calculated by LEA. Also, remove redundant LEAs. bool runOnMachineFunction(MachineFunction &MF) override; + static char ID; + private: using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>; @@ -296,18 +297,18 @@ private: MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; - - static char ID; }; } // end anonymous namespace -char OptimizeLEAPass::ID = 0; +char X86OptimizeLEAPass::ID = 0; -FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } +FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); } +INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false, + false) -int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, - const MachineInstr &Last) { +int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { // Both instructions must be in the same basic block and they must be // presented in InstrPos. assert(Last.getParent() == First.getParent() && @@ -328,10 +329,9 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, // 3) Displacement of the new memory operand should fit in 1 byte if possible. // 4) The LEA should be as close to MI as possible, and prior to it if // possible. -bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, - const MachineInstr &MI, - MachineInstr *&BestLEA, - int64_t &AddrDispShift, int &Dist) { +bool X86OptimizeLEAPass::chooseBestLEA( + const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI, + MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) { const MachineFunction *MF = MI.getParent()->getParent(); const MCInstrDesc &Desc = MI.getDesc(); int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) + @@ -387,9 +387,10 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, // Get the difference between the addresses' displacements of the two // instructions \p MI1 and \p MI2. 
The numbers of the first memory operands are // passed through \p N1 and \p N2. -int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, - const MachineInstr &MI2, - unsigned N2) const { +int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, + unsigned N1, + const MachineInstr &MI2, + unsigned N2) const { const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp); const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp); @@ -411,9 +412,9 @@ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1, // 2) Def registers of LEAs belong to the same class. // 3) All uses of the Last LEA def register are replaceable, thus the // register is used only as address base. -bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, - const MachineInstr &Last, - int64_t &AddrDispShift) const { +bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) const { assert(isLEA(First) && isLEA(Last) && "The function works only with LEA instructions"); @@ -467,7 +468,8 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, return true; } -void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { +void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + MemOpMap &LEAs) { unsigned Pos = 0; for (auto &MI : MBB) { // Assign the position number to the instruction. Note that we are going to @@ -485,7 +487,7 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) { // Try to find load and store instructions which recalculate addresses already // calculated by some LEA and replace their memory operands with its def // register. -bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { bool Changed = false; assert(!LEAs.empty()); @@ -564,9 +566,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { return Changed; } -MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, - unsigned VReg, - int64_t AddrDispShift) { +MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, + unsigned VReg, + int64_t AddrDispShift) { DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression()); if (AddrDispShift != 0) Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); @@ -583,7 +585,7 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, } // Try to find similar LEAs in the list and replace one with another. -bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { +bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { bool Changed = false; // Loop over all entries in the table. @@ -613,8 +615,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { // Loop over all uses of the Last LEA and update their operands. Note // that the correctness of this has already been checked in the // isReplaceable function. 
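One detail of the OptimizeLEAPass to X86OptimizeLEAPass rename in the hunks above: moving static char ID into the public section and adding the INITIALIZE_PASS line gives the pass a registry entry, and the macro also generates an initializeX86OptimizeLEAPassPass() function. That initializer is normally called once from the target's LLVMInitialize hook; the call site below is illustrative only and is not part of this excerpt.

// Illustrative call site (not from this patch): run the generated initializer
// once so tools can refer to the pass by name (e.g. for -print-before).
extern "C" void LLVMInitializeX86Target() {
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeX86OptimizeLEAPassPass(PR);
  // ... other X86 pass and target registrations elided ...
}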
- unsigned FirstVReg = First.getOperand(0).getReg(); - unsigned LastVReg = Last.getOperand(0).getReg(); + Register FirstVReg = First.getOperand(0).getReg(); + Register LastVReg = Last.getOperand(0).getReg(); for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end(); UI != UE;) { MachineOperand &MO = *UI++; @@ -670,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { return Changed; } -bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { +bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; if (DisableX86LEAOpt || skipFunction(MF.getFunction())) diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp index 78fede3dcde2..daddf4231897 100644 --- a/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -46,7 +46,9 @@ const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass( if (X86::GR8RegClass.hasSubClassEq(&RC) || X86::GR16RegClass.hasSubClassEq(&RC) || X86::GR32RegClass.hasSubClassEq(&RC) || - X86::GR64RegClass.hasSubClassEq(&RC)) + X86::GR64RegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) || + X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC)) return getRegBank(X86::GPRRegBankID); if (X86::FR32XRegClass.hasSubClassEq(&RC) || diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 2e2f1f9e438a..ff625325b4c9 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -544,7 +544,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); + Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -677,13 +677,13 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { MI.getOperand(4).getImm() != 0 || MI.getOperand(5).getReg() != X86::NoRegister) return false; - unsigned BasePtr = MI.getOperand(1).getReg(); + Register BasePtr = MI.getOperand(1).getReg(); // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will // be replaced with a 32-bit operand MOV which will zero extend the upper // 32-bits of the super register. if (Opc == X86::LEA64_32r) BasePtr = getX86SubSuperRegister(BasePtr, 32); - unsigned NewDestReg = MI.getOperand(0).getReg(); + Register NewDestReg = MI.getOperand(0).getReg(); const X86InstrInfo *TII = MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo(); TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr, @@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { return true; } +static bool isFuncletReturnInstr(MachineInstr &MI) { + switch (MI.getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; - MachineFunction &MF = *MI.getParent()->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + bool IsEHFuncletEpilogue = MBBI == MBB.end() ? 
false + : isFuncletReturnInstr(*MBBI); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && "Return instruction can only reference SP relative frame objects"); FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0); + } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) { + FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr); } else { FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr); } @@ -729,7 +746,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. // Don't change BasePtr since it is used later for stack adjustment. - unsigned MachineBasePtr = BasePtr; + Register MachineBasePtr = BasePtr; if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) MachineBasePtr = getX86SubSuperRegister(BasePtr, 64); @@ -773,7 +790,7 @@ Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); - unsigned FrameReg = getFrameRegister(MF); + Register FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; @@ -782,7 +799,7 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { unsigned X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); - unsigned StackReg = getStackRegister(); + Register StackReg = getStackRegister(); if (Subtarget.isTarget64BitILP32()) StackReg = getX86SubSuperRegister(StackReg, 32); return StackReg; diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index b435b22e8ac7..f8464c7e8298 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -58,8 +58,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineModuleInfo>(); - AU.addPreserved<MachineModuleInfo>(); + AU.addRequired<MachineModuleInfoWrapperPass>(); + AU.addPreserved<MachineModuleInfoWrapperPass>(); } private: @@ -97,7 +97,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { TII = STI->getInstrInfo(); Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64; - MMI = &getAnalysis<MachineModuleInfo>(); + MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); Module &M = const_cast<Module &>(*MMI->getModule()); // If this function is not a thunk, check to see if we need to insert @@ -279,7 +279,7 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CallTarget->addLiveIn(Reg); CallTarget->setHasAddressTaken(); - CallTarget->setAlignment(4); + CallTarget->setAlignment(Align(16)); insertRegReturnAddrClobber(*CallTarget, Reg); CallTarget->back().setPreInstrSymbol(MF, TargetSym); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 7574e4b8f896..9b1fcaa8a13d 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -232,8 
+232,12 @@ defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; + +defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; + defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 284d1567c5c6..06f417501b21 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -231,8 +231,12 @@ defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; + +defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; + defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td index 41bd776648f7..76001d382a27 100644 --- a/lib/Target/X86/X86SchedPredicates.td +++ b/lib/Target/X86/X86SchedPredicates.td @@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[ CheckImmOperand_s<5, "X86::COND_A">, CheckImmOperand_s<5, "X86::COND_BE"> ]>; + +// A predicate used to check if an instruction has a LOCK prefix. 
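CheckFunctionPredicate binds a scheduling predicate to two target hooks: the first name is used when the predicate is evaluated on an MCInst (for example by llvm-mca), the second when it is evaluated on a MachineInstr during CodeGen. A rough sketch of what the MachineInstr-side hook is expected to check, assuming it keys off the LOCK bit recorded in the instruction tables (not necessarily the verbatim implementation):

// Rough sketch of the MachineInstr-side hook under the assumption above.
static bool hasLockPrefixSketch(const MachineInstr &MI) {
  return MI.getDesc().TSFlags & X86II::LOCK;  // LOCK bit from the instr tables
}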
+def CheckLockPrefix : CheckFunctionPredicate< + "X86_MC::hasLockPrefix", + "X86InstrInfo::hasLockPrefix" +>; + +def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>; + +def IsRegMemCompareAndSwap_8 : CheckOpcode<[ + LCMPXCHG8, CMPXCHG8rm +]>; + +def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr +]>; + +def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[ + CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm, + LCMPXCHG16, LCMPXCHG32, LCMPXCHG64, + LCMPXCHG8B, LCMPXCHG16B +]>; + +def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>; +def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>; + +def IsRegMemCompareAndSwap : CheckOpcode< + !listconcat( + IsRegMemCompareAndSwap_8.ValidOpcodes, + IsRegMemCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsRegRegCompareAndSwap : CheckOpcode< + !listconcat( + IsRegRegCompareAndSwap_8.ValidOpcodes, + IsRegRegCompareAndSwap_16_32_64.ValidOpcodes + )>; + +def IsAtomicCompareAndSwap_8 : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap_8 +]>; + +def IsAtomicCompareAndSwap : CheckAll<[ + CheckLockPrefix, + IsRegMemCompareAndSwap +]>; + +def IsAtomicCompareAndSwap8B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap8B +]>; + +def IsAtomicCompareAndSwap16B : CheckAll<[ + CheckLockPrefix, + IsCompareAndSwap16B +]>; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index d40bdf728a48..26d4d8fa3549 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -208,8 +208,12 @@ defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; -defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; -defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; + +defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; + defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 8f3e4ae62d53..9a511ecc0071 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; + defm : 
X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 58caf1dacfcb..a8c65435ab9b 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; + defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 55ca85ec1e3d..95f710061aeb 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -102,6 +102,12 @@ class X86SchedWriteMoveLS<SchedWrite MoveRR, SchedWrite MR = StoreMR; } +// Multiclass that wraps masked load/store writes for a vector width. +class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> { + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + // Multiclass that wraps X86SchedWriteMoveLS for each vector width. class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl, X86SchedWriteMoveLS s128, @@ -218,8 +224,12 @@ def WriteFStoreY : SchedWrite; def WriteFStoreNT : SchedWrite; def WriteFStoreNTX : SchedWrite; def WriteFStoreNTY : SchedWrite; -def WriteFMaskedStore : SchedWrite; -def WriteFMaskedStoreY : SchedWrite; + +def WriteFMaskedStore32 : SchedWrite; +def WriteFMaskedStore64 : SchedWrite; +def WriteFMaskedStore32Y : SchedWrite; +def WriteFMaskedStore64Y : SchedWrite; + def WriteFMove : SchedWrite; def WriteFMoveX : SchedWrite; def WriteFMoveY : SchedWrite; @@ -530,6 +540,16 @@ def SchedWriteVecMoveLSNT : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX, WriteVecMoveLSNTY, WriteVecMoveLSNTY>; +// Conditional SIMD Packed Loads and Stores wrappers. +def WriteFMaskMove32 + : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>; +def WriteFMaskMove64 + : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>; +def WriteFMaskMove32Y + : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>; +def WriteFMaskMove64Y + : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>; + // Vector width wrappers. 
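The split of WriteFMaskedStore into 32- and 64-bit element variants (plus the X86SchedWriteMaskMove pairing with the existing masked-load writes) exists so that a scheduling model can cost dword and qword masked stores differently. The Jaguar model later in this patch is the first user that actually needs the distinction: it gives the 128-bit dword form 16cy and 19 uops versus 13cy and 10 uops for the qword form, and 22cy/36 uops versus 16cy/18 uops for the 256-bit forms. Presumably VMASKMOVPS-style instructions are mapped to the *32 classes and VMASKMOVPD-style ones to *64, but the instruction-table side of that mapping is not part of this excerpt.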
def SchedWriteFAdd : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index b0334655de7e..78acb1065ec8 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -216,8 +216,10 @@ defm : X86WriteResUnsupported<WriteFStoreY>; def : WriteRes<WriteFStoreNT, [AtomPort0]>; def : WriteRes<WriteFStoreNTX, [AtomPort0]>; defm : X86WriteResUnsupported<WriteFStoreNTY>; -defm : X86WriteResUnsupported<WriteFMaskedStore>; -defm : X86WriteResUnsupported<WriteFMaskedStoreY>; +defm : X86WriteResUnsupported<WriteFMaskedStore32>; +defm : X86WriteResUnsupported<WriteFMaskedStore32Y>; +defm : X86WriteResUnsupported<WriteFMaskedStore64>; +defm : X86WriteResUnsupported<WriteFMaskedStore64Y>; def : WriteRes<WriteFMove, [AtomPort01]>; def : WriteRes<WriteFMoveX, [AtomPort01]>; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index 8cc01c3acece..d7aea3cf4e9d 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -726,8 +726,10 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>; -defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; -defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; +defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; +defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; +defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; +defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>; defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>; diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 2d26232b4132..d0421d94ee05 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -180,9 +180,11 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW, // Instructions that have local forwarding disabled have an extra +1cy latency. -// A folded store needs a cycle on the SAGU for the store data, -// most RMW instructions don't need an extra uop. -defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>; +// A folded store needs a cycle on the SAGU for the store data, most RMW +// instructions don't need an extra uop. ALU RMW operations don't seem to +// benefit from STLF, and their observed latency is 6cy. That is the reason why +// this write adds two extra cycles (instead of just 1cy for the store). +defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>; //////////////////////////////////////////////////////////////////////////////// // Arithmetic. 
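One way to read the 6cy figure quoted in the WriteRMW comment above: with the 3cy load-to-use latency this model assumes elsewhere (it is stated explicitly in the XADD notes further down in this file) and the 1cy WriteALU latency defined just below, a memory-destination ALU operation accumulates 3 + 1 + 2 = 6 cycles through its load/ALU/store chain once WriteRMW contributes two cycles instead of one, matching the observed latency the comment cites.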
@@ -191,22 +193,22 @@ defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>; defm : JWriteResIntPair<WriteALU, [JALU01], 1>; defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>; -defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>; -defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>; -defm : X86WriteRes<WriteXCHG, [JALU01], 1, [1], 1>; - -defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>; -defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; -defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>; -defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>; +defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>; +defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>; +defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>; + +defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>; +defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>; +defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>; +defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>; +defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; +defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>; +defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>; defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>; defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>; @@ -305,6 +307,192 @@ def : WriteRes<WriteFence, [JSAGU]>; // to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. 
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; } +def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> { + let Latency = 3; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [3,16,16]; + let NumMicroOps = 5; +} + +def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 17; + let ResourceCycles = [3,17,17]; + let NumMicroOps = 6; +} + +def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 5; +} + +def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [3,1,1]; + let NumMicroOps = 18; +} + +def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 32; + let ResourceCycles = [6,1,1]; + let NumMicroOps = 28; +} + +def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [3,19,19]; + let NumMicroOps = 18; +} + +def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 38; + let ResourceCycles = [6,38,38]; + let NumMicroOps = 28; +} + +def JWriteCMPXCHGVariant : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>, + SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>, + SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>, + SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>, + SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>, + SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>, + SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>, + SchedVar<NoSchedPred, [WriteCMPXCHG]> +]>; + +// The first five reads are contributed by the memory load operand. +// We ignore those reads and set a read-advance for the other input operands +// including the implicit read of RAX. +def : InstRW<[JWriteCMPXCHGVariant, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16, + LCMPXCHG32, LCMPXCHG64, + CMPXCHG8rm, CMPXCHG16rm, + CMPXCHG32rm, CMPXCHG64rm)>; + +def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr, + CMPXCHG32rr, CMPXCHG64rr)>; + +def : InstRW<[JWriteCMPXCHGVariant, + // Ignore reads contributed by the memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Add a read-advance to every implicit register read. 
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B, + CMPXCHG8B, CMPXCHG16B)>; + +def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> { + let Latency = 19; + let ResourceCycles = [1,19,19]; + let NumMicroOps = 1; +} + +def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>, + SchedVar<NoSchedPred, [WriteALURMW]> +]>; +def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m, + DEC8m, DEC16m, DEC32m, DEC64m, + NOT8m, NOT16m, NOT32m, NOT64m, + NEG8m, NEG16m, NEG32m, NEG64m)>; + +def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> { + let Latency = 2; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} +def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr, + XADD32rr, XADD64rr)>; + +// This write defines the latency of the in/out register operand of a non-atomic +// XADDrm. This is the first of a pair of writes that model non-atomic +// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part). +// +// We need two writes because the instruction latency differs from the output +// register operand latency. In particular, the first write describes the first +// (and only) output register operand of the instruction. However, the +// instruction latency is set to the MAX of all the write latencies. That's why +// a second write is needed in this case (see example below). +// +// Example: +// XADD %ecx, (%rsp) ## Instruction latency: 11cy +// ## ECX write Latency: 3cy +// +// Register ECX becomes available in 3 cycles. That is because the value of ECX +// is exchanged with the value read from the stack pointer, and the load-to-use +// latency is assumed to be 3cy. +def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 3; // load-to-use latency + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XADDrm. This is the first of a sequence of two writes used to model atomic +// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part. +// +// +// Example: +// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy +// ## ECX write Latency: 11cy +// +// The value of ECX becomes available only after 11cy from the start of +// execution. This write is used to specifically set that operand latency. +def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [3]; + let NumMicroOps = 3; +} + +// This write defines the latency of the in/out register operand of an atomic +// XCHGrm. This write is the first of a sequence of two writes that describe +// atomic XCHG operations. We need two writes because the instruction latency +// differs from the output register write latency. We want to make sure that +// the output register operand becomes visible after 11cy. However, we want to +// set the instruction latency to 16cy. 
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> { + let Latency = 11; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 11; + let ResourceCycles = [1, 1]; + let NumMicroOps = 1; +} + +def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> { + let Latency = 16; + let ResourceCycles = [16, 16]; + let NumMicroOps = 1; +} + +def JWriteXADDrm_Part1 : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>, + SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]> +]>; + +def JWriteXADDrm_Part2 : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>, + SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]> +]>; + +def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd], + (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm, + LXADD8, LXADD16, LXADD32, LXADD64)>; + +def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd], + (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>; + + //////////////////////////////////////////////////////////////////////////////// // Floating point. This covers both scalar and vector operations. //////////////////////////////////////////////////////////////////////////////// @@ -313,19 +501,22 @@ defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>; defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>; defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>; defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>; defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; -defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>; defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>; -defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>; -defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>; +defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>; +defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>; +defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>; defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>; @@ -466,8 +657,8 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; //////////////////////////////////////////////////////////////////////////////// defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; -defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 
1], 1>; -defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>; defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>; @@ -475,7 +666,7 @@ defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; -defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>; defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>; defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>; @@ -631,6 +822,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> { def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; /////////////////////////////////////////////////////////////////////////////// +// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ +/////////////////////////////////////////////////////////////////////////////// + +def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { + let Latency = 34; + let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; + let NumMicroOps = 63; +} +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; + +/////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. 
/////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 34c251a5c5bb..8e3ce721f1a1 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -186,8 +186,12 @@ def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>; -def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>; -def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>; + +def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>; + def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 65f6d89df610..06201f4a3a84 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -268,8 +268,12 @@ defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>; -defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; -defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; + +defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; + defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 50690953eef5..1ae8df977f83 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -36,7 +36,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( DAG.getSubtarget().getRegisterInfo()); - unsigned BaseReg = TRI->getBaseRegister(); + Register BaseReg = TRI->getBaseRegister(); for (unsigned R : ClobberSet) if (BaseReg == R) return true; diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 40f5dbe57e4b..b8980789258e 100644 --- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -477,7 +477,7 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( // Otherwise, just build the predicate state itself by zeroing a register // as we don't need any initial state. 
PS->InitialReg = MRI->createVirtualRegister(PS->RC); - unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), PredStateSubReg); ++NumInstsInserted; @@ -750,7 +750,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); // Note that we intentionally use an empty debug location so that // this picks up the preceding location. auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(), @@ -907,7 +907,7 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( MI.dump(); dbgs() << "\n"); report_fatal_error("Unable to unfold load!"); } - unsigned Reg = MRI->createVirtualRegister(UnfoldedRC); + Register Reg = MRI->createVirtualRegister(UnfoldedRC); SmallVector<MachineInstr *, 2> NewMIs; // If we were able to compute an unfolded reg class, any failure here // is just a programming error so just assert. @@ -1102,7 +1102,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // synthetic target in the predecessor. We do this at the bottom of the // predecessor. auto InsertPt = Pred->getFirstTerminator(); - unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass); if (MF.getTarget().getCodeModel() == CodeModel::Small && !Subtarget->isPositionIndependent()) { // Directly materialize it into an immediate. @@ -1153,7 +1153,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n"); } else { // Otherwise compute the address into a register first. - unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); + Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass); auto AddrI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg) .addReg(/*Base*/ X86::RIP) @@ -1175,7 +1175,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( // Now cmov over the predicate if the comparison wasn't equal. int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg) .addReg(PS->InitialReg) @@ -1878,7 +1878,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( DebugLoc Loc) { // FIXME: Hard coding this to a 32-bit register class seems weird, but matches // what instruction selection does. - unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass); + Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass); // We directly copy the FLAGS register and rely on later lowering to clean // this up into the appropriate setCC instructions. 
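The unsigned-to-Register substitutions running through this pass (and through most of the files above) are mechanical: llvm::Register wraps the same integer id and converts implicitly in both directions, which is why call sites such as createVirtualRegister() and getReg() keep compiling unchanged while gaining the Register::isVirtualRegister / Register::isPhysicalRegister helpers already used earlier in the patch. A minimal sketch of the round trip:

#include "llvm/CodeGen/Register.h"
using llvm::Register;

// Register round-trips through the raw id it wraps, so swapping it in for
// unsigned is a drop-in change at existing call sites.
unsigned rawId(Register R) { return R; }                 // implicit conversion out
Register fromRaw(unsigned Id) { return Register(Id); }   // and back in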
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS); @@ -1905,7 +1905,7 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS( void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, unsigned PredStateReg) { - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // FIXME: This hard codes a shift distance based on the number of bits needed // to stay canonical on 64-bit. We should compute this somehow and support // 32-bit as part of that. @@ -1925,8 +1925,8 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { - unsigned PredStateReg = MRI->createVirtualRegister(PS->RC); - unsigned TmpReg = MRI->createVirtualRegister(PS->RC); + Register PredStateReg = MRI->createVirtualRegister(PS->RC); + Register TmpReg = MRI->createVirtualRegister(PS->RC); // We know that the stack pointer will have any preserved predicate state in // its high bit. We just want to smear this across the other bits. Turns out, @@ -2031,9 +2031,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( } for (MachineOperand *Op : HardenOpRegs) { - unsigned OpReg = Op->getReg(); + Register OpReg = Op->getReg(); auto *OpRC = MRI->getRegClass(OpReg); - unsigned TmpReg = MRI->createVirtualRegister(OpRC); + Register TmpReg = MRI->createVirtualRegister(OpRC); // If this is a vector register, we'll need somewhat custom logic to handle // hardening it. @@ -2045,7 +2045,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( // Move our state into a vector register. // FIXME: We could skip this at the cost of longer encodings with AVX-512 // but that doesn't seem likely worth it. - unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); + Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass); auto MovI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg) .addReg(StateReg); @@ -2054,7 +2054,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n"); // Broadcast it across the vector register. - unsigned VBStateReg = MRI->createVirtualRegister(OpRC); + Register VBStateReg = MRI->createVirtualRegister(OpRC); auto BroadcastI = BuildMI(MBB, InsertPt, Loc, TII->get(Is128Bit ? X86::VPBROADCASTQrr : X86::VPBROADCASTQYrr), @@ -2084,7 +2084,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!"); // Broadcast our state into a vector register. - unsigned VStateReg = MRI->createVirtualRegister(OpRC); + Register VStateReg = MRI->createVirtualRegister(OpRC); unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128r : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr; @@ -2153,7 +2153,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // See if we can sink hardening the loaded value. auto SinkCheckToSingleUse = [&](MachineInstr &MI) -> Optional<MachineInstr *> { - unsigned DefReg = MI.getOperand(0).getReg(); + Register DefReg = MI.getOperand(0).getReg(); // We need to find a single use which we can sink the check. 
We can // primarily do this because many uses may already end up checked on their @@ -2210,8 +2210,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // If this register isn't a virtual register we can't walk uses of sanely, // just bail. Also check that its register class is one of the ones we // can harden. - unsigned UseDefReg = UseMI.getOperand(0).getReg(); - if (!TRI->isVirtualRegister(UseDefReg) || + Register UseDefReg = UseMI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(UseDefReg) || !canHardenRegister(UseDefReg)) return {}; @@ -2241,6 +2241,9 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { // We don't support post-load hardening of vectors. return false; + unsigned RegIdx = Log2_32(RegBytes); + assert(RegIdx < 4 && "Unsupported register size"); + // If this register class is explicitly constrained to a class that doesn't // require REX prefix, we may not be able to satisfy that constraint when // emitting the hardening instructions, so bail out here. @@ -2251,13 +2254,13 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) { const TargetRegisterClass *NOREXRegClasses[] = { &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass, &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass}; - if (RC == NOREXRegClasses[Log2_32(RegBytes)]) + if (RC == NOREXRegClasses[RegIdx]) return false; const TargetRegisterClass *GPRRegClasses[] = { &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass, &X86::GR64RegClass}; - return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]); + return RC->hasSuperClassEq(GPRRegClasses[RegIdx]); } /// Harden a value in a register. @@ -2278,7 +2281,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc) { assert(canHardenRegister(Reg) && "Cannot harden this register!"); - assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!"); + assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!"); auto *RC = MRI->getRegClass(Reg); int Bytes = TRI->getRegSizeInBits(*RC) / 8; @@ -2289,7 +2292,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (Bytes != 8) { unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; unsigned SubRegImm = SubRegImms[Log2_32(Bytes)]; - unsigned NarrowStateReg = MRI->createVirtualRegister(RC); + Register NarrowStateReg = MRI->createVirtualRegister(RC); BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg) .addReg(StateReg, 0, SubRegImm); StateReg = NarrowStateReg; @@ -2299,7 +2302,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister( if (isEFLAGSLive(MBB, InsertPt, *TRI)) FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); - unsigned NewReg = MRI->createVirtualRegister(RC); + Register NewReg = MRI->createVirtualRegister(RC); unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)]; auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg) @@ -2329,13 +2332,13 @@ unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) { DebugLoc Loc = MI.getDebugLoc(); auto &DefOp = MI.getOperand(0); - unsigned OldDefReg = DefOp.getReg(); + Register OldDefReg = DefOp.getReg(); auto *DefRC = MRI->getRegClass(OldDefReg); // Because we want to completely replace the uses of this def'ed value with // the hardened value, create a dedicated new register that will only be used // to 
communicate the unhardened value to the hardening. - unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC); + Register UnhardenedReg = MRI->createVirtualRegister(DefRC); DefOp.setReg(UnhardenedReg); // Now harden this register's value, getting a hardened reg that is safe to @@ -2537,7 +2540,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( .addReg(ExpectedRetAddrReg, RegState::Kill) .addSym(RetSymbol); } else { - unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); + Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC); BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg) .addReg(/*Base*/ X86::RIP) .addImm(/*Scale*/ 1) @@ -2554,7 +2557,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8; auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes); - unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC); + Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC); auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg) .addReg(NewStateReg, RegState::Kill) .addReg(PS->PoisonReg) @@ -2611,7 +2614,7 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr( // For all of these, the target register is the first operand of the // instruction. auto &TargetOp = MI.getOperand(0); - unsigned OldTargetReg = TargetOp.getReg(); + Register OldTargetReg = TargetOp.getReg(); // Try to lookup a hardened version of this register. We retain a reference // here as we want to update the map to track any newly computed hardened diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index d5bb56603df9..f8f78da52cc2 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -146,6 +146,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV, return X86II::MO_DLLIMPORT; return X86II::MO_COFFSTUB; } + // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables. + if (isOSWindows()) + return X86II::MO_NO_FLAG; if (is64Bit()) { // ELF supports a large, truly PIC code model with non-PC relative GOT @@ -285,10 +288,10 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both // 32 and 64 bit) and for all 64-bit targets. if (StackAlignOverride) - stackAlignment = StackAlignOverride; + stackAlignment = *StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || isTargetKFreeBSD() || In64BitMode) - stackAlignment = 16; + stackAlignment = Align(16); // Some CPUs have more overhead for gather. The specified overhead is relative // to the Load operation. "2" is the number provided by Intel architects. This @@ -304,6 +307,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Consume the vector width attribute or apply any target specific limit. 
if (PreferVectorWidthOverride) PreferVectorWidth = PreferVectorWidthOverride; + else if (Prefer128Bit) + PreferVectorWidth = 128; else if (Prefer256Bit) PreferVectorWidth = 256; } @@ -316,12 +321,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, - unsigned StackAlignOverride, + MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth) - : X86GenSubtargetInfo(TT, CPU, FS), - PICStyle(PICStyles::None), TM(TM), TargetTriple(TT), - StackAlignOverride(StackAlignOverride), + : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM), + TargetTriple(TT), StackAlignOverride(StackAlignOverride), PreferVectorWidthOverride(PreferVectorWidthOverride), RequiredVectorWidth(RequiredVectorWidth), In64BitMode(TargetTriple.getArch() == Triple::x86_64), @@ -355,7 +359,7 @@ const CallLowering *X86Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } -const InstructionSelector *X86Subtarget::getInstructionSelector() const { +InstructionSelector *X86Subtarget::getInstructionSelector() const { return InstSelector.get(); } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 24ccc9cb7843..e8efe8f2afe5 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -365,8 +365,8 @@ protected: /// Processor has AVX-512 vp2intersect instructions bool HasVP2INTERSECT = false; - /// Processor supports MPX - Memory Protection Extensions - bool HasMPX = false; + /// Deprecated flag for MPX instructions. + bool DeprecatedHasMPX = false; /// Processor supports CET SHSTK - Control-Flow Enforcement Technology /// using Shadow Stack @@ -427,15 +427,21 @@ protected: /// Use software floating point for code generation. bool UseSoftFloat = false; + /// Use alias analysis during code generation. + bool UseAA = false; + /// The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. - unsigned stackAlignment = 4; + Align stackAlignment = Align(4); /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// // FIXME: this is a known good value for Yonah. How about others? unsigned MaxInlineSizeThreshold = 128; + /// Indicates target prefers 128 bit instructions. + bool Prefer128Bit = false; + /// Indicates target prefers 256 bit instructions. bool Prefer256Bit = false; @@ -453,7 +459,7 @@ protected: private: /// Override the stack alignment. - unsigned StackAlignOverride; + MaybeAlign StackAlignOverride; /// Preferred vector width from function attribute. unsigned PreferVectorWidthOverride; @@ -490,7 +496,7 @@ public: /// of the specified triple. /// X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const X86TargetMachine &TM, unsigned StackAlignOverride, + const X86TargetMachine &TM, MaybeAlign StackAlignOverride, unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth); @@ -515,7 +521,7 @@ public: /// Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. - unsigned getStackAlignment() const { return stackAlignment; } + Align getStackAlignment() const { return stackAlignment; } /// Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. 
@@ -527,7 +533,7 @@ public: /// Methods used by Global ISel const CallLowering *getCallLowering() const override; - const InstructionSelector *getInstructionSelector() const override; + InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; const RegisterBankInfo *getRegBankInfo() const override; @@ -684,7 +690,6 @@ public: bool hasBF16() const { return HasBF16; } bool hasVP2INTERSECT() const { return HasVP2INTERSECT; } bool hasBITALG() const { return HasBITALG; } - bool hasMPX() const { return HasMPX; } bool hasSHSTK() const { return HasSHSTK; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } bool hasCLWB() const { return HasCLWB; } @@ -739,6 +744,7 @@ public: X86ProcFamily == IntelTRM; } bool useSoftFloat() const { return UseSoftFloat; } + bool useAA() const override { return UseAA; } /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for /// no-sse2). There isn't any reason to disable it if the target processor @@ -809,6 +815,7 @@ public: // On Win64, all these conventions just use the default convention. case CallingConv::C: case CallingConv::Fast: + case CallingConv::Tail: case CallingConv::Swift: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 0cbf13899a29..c15297134e4d 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -81,27 +81,28 @@ extern "C" void LLVMInitializeX86Target() { initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); + initializeX86OptimizeLEAPassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) - return llvm::make_unique<X86_64MachoTargetObjectFile>(); - return llvm::make_unique<TargetLoweringObjectFileMachO>(); + return std::make_unique<X86_64MachoTargetObjectFile>(); + return std::make_unique<TargetLoweringObjectFileMachO>(); } if (TT.isOSFreeBSD()) - return llvm::make_unique<X86FreeBSDTargetObjectFile>(); + return std::make_unique<X86FreeBSDTargetObjectFile>(); if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) - return llvm::make_unique<X86LinuxNaClTargetObjectFile>(); + return std::make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSSolaris()) - return llvm::make_unique<X86SolarisTargetObjectFile>(); + return std::make_unique<X86SolarisTargetObjectFile>(); if (TT.isOSFuchsia()) - return llvm::make_unique<X86FuchsiaTargetObjectFile>(); + return std::make_unique<X86FuchsiaTargetObjectFile>(); if (TT.isOSBinFormatELF()) - return llvm::make_unique<X86ELFTargetObjectFile>(); + return std::make_unique<X86ELFTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) - return llvm::make_unique<TargetLoweringObjectFileCOFF>(); + return std::make_unique<TargetLoweringObjectFileCOFF>(); llvm_unreachable("unknown subtarget type"); } @@ -116,6 +117,9 @@ static std::string computeDataLayout(const Triple &TT) { !TT.isArch64Bit()) Ret += "-p:32:32"; + // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. + Ret += "-p270:32:32-p271:32:32-p272:64:64"; + // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. 
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) Ret += "-i64:64"; @@ -218,17 +222,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())) { - // Windows stack unwinder gets confused when execution flow "falls through" - // after a call to 'noreturn' function. - // To prevent that, we emit a trap for 'unreachable' IR instructions. - // (which on X86, happens to be the 'ud2' instruction) // On PS4, the "return address" of a 'noreturn' call must still be within // the calling function, and TrapUnreachable is an easy way to get that. - // The check here for 64-bit windows is a bit icky, but as we're unlikely - // to ever want to mix 32 and 64-bit windows code in a single module - // this should be fine. - if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() || - TT.isOSBinFormatMachO()) { + if (TT.isPS4() || TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO(); } @@ -311,10 +307,10 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. resetTargetOptions(F); - I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, - Options.StackAlignmentOverride, - PreferVectorWidthOverride, - RequiredVectorWidth); + I = std::make_unique<X86Subtarget>( + TargetTriple, CPU, FS, *this, + MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride, + RequiredVectorWidth); } return I.get(); } @@ -517,12 +513,19 @@ void X86PassConfig::addPreEmitPass() { } void X86PassConfig::addPreEmitPass2() { + const Triple &TT = TM->getTargetTriple(); + const MCAsmInfo *MAI = TM->getMCAsmInfo(); + addPass(createX86RetpolineThunksPass()); + + // Insert extra int3 instructions after trailing call instructions to avoid + // issues in the unwinder. + if (TT.isOSWindows() && TT.getArch() == Triple::x86_64) + addPass(createX86AvoidTrailingCallPass()); + // Verify basic block incoming and outgoing cfa offset and register values and // correct CFA calculation rule where needed by inserting appropriate CFI // instructions. 
- const Triple &TT = TM->getTargetTriple(); - const MCAsmInfo *MAI = TM->getMCAsmInfo(); if (!TT.isOSDarwin() && (!TT.isOSWindows() || MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI)) diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index b999e2e86af6..ec3db7b1e9e8 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -16,7 +16,6 @@ #include "X86Subtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetMachine.h" #include <memory> @@ -26,6 +25,7 @@ namespace llvm { class StringRef; class X86Subtarget; class X86RegisterBankInfo; +class TargetTransformInfo; class X86TargetMachine final : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 92e0779c2e74..44185957686b 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -47,8 +47,8 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol( } const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel( - const MCSymbol *Sym, const MCValue &MV, int64_t Offset, - MachineModuleInfo *MMI, MCStreamer &Streamer) const { + const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, + int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { // On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry // from a data section. In case there's an additional offset, then use // foo@GOTPCREL+4+<offset>. diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 13d7b4ad70d6..1fd0bbf56b19 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -30,7 +30,8 @@ namespace llvm { const TargetMachine &TM, MachineModuleInfo *MMI) const override; - const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym, + const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, + const MCSymbol *Sym, const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 3dc59aeb263e..70fd857fcf01 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); } -unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { +unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { + bool Vector = (ClassID == 1); if (Vector && !ST->hasSSE1()) return 0; @@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost( int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. - // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. + // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // Treat Transpose as 2-op shuffles - there's no difference in lowering. 
@@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, int NumSubElts = SubLT.second.getVectorNumElements(); if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) return SubLT.first; + // Handle some cases for widening legalization. For now we only handle + // cases where the original subvector was naturally aligned and evenly + // fit in its legalized subvector type. + // FIXME: Remove some of the alignment restrictions. + // FIXME: We can use permq for 64-bit or larger extracts from 256-bit + // vectors. + int OrigSubElts = SubTp->getVectorNumElements(); + if (NumSubElts > OrigSubElts && + (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 && + LT.second.getVectorElementType() == + SubLT.second.getVectorElementType() && + LT.second.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits()) { + assert(NumElts >= NumSubElts && NumElts > OrigSubElts && + "Unexpected number of elements!"); + Type *VecTy = VectorType::get(Tp->getVectorElementType(), + LT.second.getVectorNumElements()); + Type *SubTy = VectorType::get(Tp->getVectorElementType(), + SubLT.second.getVectorNumElements()); + int ExtractIndex = alignDown((Index % NumElts), NumSubElts); + int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy, + ExtractIndex, SubTy); + + // If the original size is 32-bits or more, we can use pshufd. Otherwise + // if we have SSSE3 we can use pshufb. + if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) + return ExtractCost + 1; // pshufd or pshufb + + assert(SubTp->getPrimitiveSizeInBits() == 16 && + "Unexpected vector size"); + + return ExtractCost + 2; // worst case pshufhw + pshufd + } } } @@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, @@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, @@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { 
ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, @@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, @@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, }; @@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, @@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, @@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, { 
ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW + { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD }; std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); @@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, } } + static const CostTblEntry SLMCostTbl[] = { + // slm pcmpeq/pcmpgt throughput is 2 + { ISD::SETCC, MVT::v2i64, 2 }, + }; + static const CostTblEntry AVX512BWCostTbl[] = { { ISD::SETCC, MVT::v32i16, 1 }, { ISD::SETCC, MVT::v64i8, 1 }, @@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, { ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps }; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) + return LT.first * (ExtraCost + Entry->Cost); + if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) return LT.first * (ExtraCost + Entry->Cost); @@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; + static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets + { ISD::CTLZ, MVT::i64, 1 }, + }; + static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTLZ, MVT::i32, 1 }, + { ISD::CTLZ, MVT::i16, 1 }, + { ISD::CTLZ, MVT::i8, 1 }, + }; + static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets + { ISD::CTPOP, MVT::i64, 1 }, + }; + static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTPOP, MVT::i32, 1 }, + { ISD::CTPOP, MVT::i16, 1 }, + { ISD::CTPOP, MVT::i8, 1 }, + }; static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTPOP, MVT::i64, 10 }, { ISD::SADDO, MVT::i64, 1 }, { ISD::UADDO, MVT::i64, 1 }, }; @@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTPOP, MVT::i32, 8 }, + { ISD::CTPOP, MVT::i16, 9 }, + { ISD::CTPOP, MVT::i8, 7 }, { ISD::SADDO, MVT::i32, 1 }, { ISD::SADDO, MVT::i16, 1 }, { ISD::SADDO, MVT::i8, 1 }, @@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasLZCNT()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + + if (ST->hasPOPCNT()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + + // TODO - add BMI (TZCNT) scalar handling + if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) return LT.first * 
Entry->Cost; @@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) || - (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { + if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) || + (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) || + !isPowerOf2_32(NumElem)) { // Scalarization int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); int ScalarCompareCost = getCmpSelInstrCost( @@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { - - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); - - MVT MTy = LT.second; - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry SSE42CostTblPairWise[] = { + static const CostTblEntry SSE2CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32. { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". + { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16 + { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16 { ISD::ADD, MVT::v8i16, 5 }, + { ISD::ADD, MVT::v2i8, 2 }, + { ISD::ADD, MVT::v4i8, 2 }, + { ISD::ADD, MVT::v8i8, 2 }, + { ISD::ADD, MVT::v16i8, 3 }, }; static const CostTblEntry AVX1CostTblPairWise[] = { - { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". - { ISD::ADD, MVT::v8i16, 5 }, { ISD::ADD, MVT::v8i32, 5 }, + { ISD::ADD, MVT::v16i16, 6 }, + { ISD::ADD, MVT::v32i8, 4 }, }; - static const CostTblEntry SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE2CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". + { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". + { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". + { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". + { ISD::ADD, MVT::v2i8, 2 }, + { ISD::ADD, MVT::v4i8, 2 }, + { ISD::ADD, MVT::v8i8, 2 }, + { ISD::ADD, MVT::v16i8, 3 }, }; static const CostTblEntry AVX1CostTblNoPairWise[] = { - { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, + { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v8f32, 4 }, { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8". 
{ ISD::ADD, MVT::v4i64, 3 }, - { ISD::ADD, MVT::v8i16, 4 }, { ISD::ADD, MVT::v8i32, 5 }, + { ISD::ADD, MVT::v16i16, 5 }, + { ISD::ADD, MVT::v32i8, 4 }, }; + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // Before legalizing the type, give a chance to look up illegal narrow types + // in the table. + // FIXME: Is there a better way to do this? + EVT VT = TLI->getValueType(DL, ValTy); + if (VT.isSimple()) { + MVT MTy = VT.getSimpleVT(); + if (IsPairwise) { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) + return Entry->Cost; + } else { + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; + } + } + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + if (IsPairwise) { if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) return LT.first * Entry->Cost; } else { if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) return LT.first * Entry->Cost; } @@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() { return ST->hasMacroFusion() || ST->hasBranchFusion(); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { if (!ST->hasAVX()) return false; @@ -3139,11 +3281,11 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { - return isLegalMaskedLoad(DataType); +bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + return isLegalMaskedLoad(DataType, Alignment); } -bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { +bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { unsigned DataSize = DL.getTypeStoreSize(DataType); // The only supported nontemporal loads are for aligned vectors of 16 or 32 // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 @@ -3154,7 +3296,7 @@ bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) { return false; } -bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) { +bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { unsigned DataSize = DL.getTypeStoreSize(DataType); // SSE4A supports nontemporal stores of float and double at arbitrary @@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { if (IsZeroCmp) { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). - // TODO: enable AVX512 when the DAG is ready. 
- // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); const unsigned PreferredWidth = ST->getPreferVectorWidth(); + if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); // All GPR and vector loads can be unaligned. SIMD compare requires integer diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 25d9c33eb16d..7581257f41f8 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -83,6 +83,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { X86::FeatureSlowUAMem32, // Based on whether user set the -mprefer-vector-width command line. + X86::FeaturePrefer128Bit, X86::FeaturePrefer256Bit, // CPU name enums. These just follow CPU string. @@ -115,7 +116,7 @@ public: /// \name Vector TTI Implementations /// @{ - unsigned getNumberOfRegisters(bool Vector); + unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); @@ -184,10 +185,10 @@ public: bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); - bool isLegalMaskedLoad(Type *DataType); - bool isLegalMaskedStore(Type *DataType); - bool isLegalNTLoad(Type *DataType, unsigned Alignment); - bool isLegalNTStore(Type *DataType, unsigned Alignment); + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment); + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment); + bool isLegalNTLoad(Type *DataType, Align Alignment); + bool isLegalNTStore(Type *DataType, Align Alignment); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); bool isLegalMaskedExpandLoad(Type *DataType); diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index a07d2f20acab..9280d030b5d5 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -292,8 +292,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // need to insert any VZEROUPPER instructions. This is constant-time, so it // is cheap in the common case of no ymm/zmm use. 
bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm; - const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass}; - for (auto *RC : RCs) { + for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) { if (!YmmOrZmmUsed) { for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; i++) { @@ -304,9 +303,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { } } } - if (!YmmOrZmmUsed) { + if (!YmmOrZmmUsed) return false; - } assert(BlockStates.empty() && DirtySuccessors.empty() && "X86VZeroUpper state should be clear"); diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp index 9e499db1d7ee..ae72c6427588 100644 --- a/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/lib/Target/X86/X86WinAllocaExpander.cpp @@ -81,7 +81,7 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) { MI->getOpcode() == X86::WIN_ALLOCA_64); assert(MI->getOperand(0).isReg()); - unsigned AmountReg = MI->getOperand(0).getReg(); + Register AmountReg = MI->getOperand(0).getReg(); MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg); if (!Def || @@ -261,7 +261,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) { break; } - unsigned AmountReg = MI->getOperand(0).getReg(); + Register AmountReg = MI->getOperand(0).getReg(); MI->eraseFromParent(); // Delete the definition of AmountReg. diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index f68d17d7256d..d65e1f3ab414 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -339,7 +339,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); Value *FrameAddr = Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress), + Intrinsic::getDeclaration( + TheModule, Intrinsic::frameaddress, + Builder.getInt8PtrTy( + TheModule->getDataLayout().getAllocaAddrSpace())), Builder.getInt32(0), "frameaddr"); Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty); FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val); |