author    Dimitry Andric <dim@FreeBSD.org>    2019-10-23 17:51:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2019-10-23 17:51:42 +0000
commit    1d5ae1026e831016fc29fd927877c86af904481f (patch)
tree      2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Target/X86
parent    e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (diff)
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 170
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParserCommon.h | 4
-rw-r--r--  lib/Target/X86/AsmParser/X86Operand.h | 25
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 5
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 6
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 19
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 61
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 11
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 7
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 5
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 2
-rw-r--r--  lib/Target/X86/X86.h | 10
-rw-r--r--  lib/Target/X86/X86.td | 56
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 8
-rw-r--r--  lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 3
-rw-r--r--  lib/Target/X86/X86AvoidTrailingCall.cpp | 108
-rw-r--r--  lib/Target/X86/X86CallFrameOptimization.cpp | 26
-rw-r--r--  lib/Target/X86/X86CallLowering.cpp | 49
-rw-r--r--  lib/Target/X86/X86CallLowering.h | 5
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 2
-rw-r--r--  lib/Target/X86/X86CmovConversion.cpp | 18
-rw-r--r--  lib/Target/X86/X86CondBrFolding.cpp | 2
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp | 20
-rwxr-xr-x  lib/Target/X86/X86EvexToVex.cpp | 2
-rw-r--r--  lib/Target/X86/X86ExpandPseudo.cpp | 11
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 15
-rw-r--r--  lib/Target/X86/X86FixupBWInsts.cpp | 68
-rw-r--r--  lib/Target/X86/X86FixupLEAs.cpp | 201
-rw-r--r--  lib/Target/X86/X86FixupSetCC.cpp | 4
-rw-r--r--  lib/Target/X86/X86FlagsCopyLowering.cpp | 13
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 6
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 106
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 4
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 304
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 5926
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 77
-rw-r--r--  lib/Target/X86/X86IndirectBranchTracking.cpp | 2
-rw-r--r--  lib/Target/X86/X86InsertPrefetch.cpp | 8
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 1457
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 10
-rw-r--r--  lib/Target/X86/X86InstrBuilder.h | 6
-rw-r--r--  lib/Target/X86/X86InstrCMovSetCC.td | 33
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 139
-rw-r--r--  lib/Target/X86/X86InstrControl.td | 85
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 11
-rw-r--r--  lib/Target/X86/X86InstrFoldTables.cpp | 287
-rw-r--r--  lib/Target/X86/X86InstrFoldTables.h | 39
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 26
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 582
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 28
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 57
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 33
-rw-r--r--  lib/Target/X86/X86InstrMPX.td | 32
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 551
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 2
-rw-r--r--  lib/Target/X86/X86InstrTSX.td | 2
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 26
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp | 135
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 6
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 20
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.h | 3
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 313
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h | 8
-rw-r--r--  lib/Target/X86/X86OptimizeLEAs.cpp | 60
-rw-r--r--  lib/Target/X86/X86RegisterBankInfo.cpp | 4
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 31
-rw-r--r--  lib/Target/X86/X86RetpolineThunks.cpp | 8
-rwxr-xr-x  lib/Target/X86/X86SchedBroadwell.td | 8
-rw-r--r--  lib/Target/X86/X86SchedHaswell.td | 8
-rw-r--r--  lib/Target/X86/X86SchedPredicates.td | 57
-rw-r--r--  lib/Target/X86/X86SchedSandyBridge.td | 8
-rw-r--r--  lib/Target/X86/X86SchedSkylakeClient.td | 8
-rwxr-xr-x  lib/Target/X86/X86SchedSkylakeServer.td | 8
-rw-r--r--  lib/Target/X86/X86Schedule.td | 24
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 6
-rw-r--r--  lib/Target/X86/X86ScheduleBdVer2.td | 6
-rw-r--r--  lib/Target/X86/X86ScheduleBtVer2.td | 257
-rw-r--r--  lib/Target/X86/X86ScheduleSLM.td | 8
-rw-r--r--  lib/Target/X86/X86ScheduleZnver1.td | 8
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 2
-rw-r--r--  lib/Target/X86/X86SpeculativeLoadHardening.cpp | 59
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 18
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 23
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 49
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 2
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 4
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 3
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 225
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.h | 11
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 6
-rw-r--r--  lib/Target/X86/X86WinAllocaExpander.cpp | 4
-rw-r--r--  lib/Target/X86/X86WinEHState.cpp | 5
95 files changed, 7513 insertions, 4674 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 95cbf46d37ed..25be79ec2b1e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -870,6 +870,14 @@ private:
bool parseDirectiveFPOEndProc(SMLoc L);
bool parseDirectiveFPOData(SMLoc L);
+ /// SEH directives.
+ bool parseSEHRegisterNumber(unsigned RegClassID, unsigned &RegNo);
+ bool parseDirectiveSEHPushReg(SMLoc);
+ bool parseDirectiveSEHSetFrame(SMLoc);
+ bool parseDirectiveSEHSaveReg(SMLoc);
+ bool parseDirectiveSEHSaveXMM(SMLoc);
+ bool parseDirectiveSEHPushFrame(SMLoc);
+
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
@@ -955,6 +963,8 @@ private:
public:
enum X86MatchResultTy {
Match_Unsupported = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "X86GenAsmMatcher.inc"
};
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
@@ -3173,6 +3183,13 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
EmitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
+ case Match_InvalidImmUnsignedi4: {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
case Match_MissingFeature:
return ErrorMissingFeature(IDLoc, MissingFeatures, MatchingInlineAsm);
case Match_InvalidOperand:
@@ -3520,6 +3537,15 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchingInlineAsm);
}
+ if (std::count(std::begin(Match), std::end(Match),
+ Match_InvalidImmUnsignedi4) == 1) {
+ SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ return Error(ErrorLoc, "immediate must be an integer in range [0, 15]",
+ EmptyRange, MatchingInlineAsm);
+ }
+
// If all of these were an outright failure, report it in a useless way.
return Error(IDLoc, "unknown instruction mnemonic", EmptyRange,
MatchingInlineAsm);
@@ -3572,6 +3598,16 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
else if (IDVal == ".cv_fpo_endproc")
return parseDirectiveFPOEndProc(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushreg")
+ return parseDirectiveSEHPushReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_setframe")
+ return parseDirectiveSEHSetFrame(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savereg")
+ return parseDirectiveSEHSaveReg(DirectiveID.getLoc());
+ else if (IDVal == ".seh_savexmm")
+ return parseDirectiveSEHSaveXMM(DirectiveID.getLoc());
+ else if (IDVal == ".seh_pushframe")
+ return parseDirectiveSEHPushFrame(DirectiveID.getLoc());
return true;
}
@@ -3708,6 +3744,140 @@ bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
return getTargetStreamer().emitFPOEndProc(L);
}
+bool X86AsmParser::parseSEHRegisterNumber(unsigned RegClassID,
+ unsigned &RegNo) {
+ SMLoc startLoc = getLexer().getLoc();
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ // Try parsing the argument as a register first.
+ if (getLexer().getTok().isNot(AsmToken::Integer)) {
+ SMLoc endLoc;
+ if (ParseRegister(RegNo, startLoc, endLoc))
+ return true;
+
+ if (!X86MCRegisterClasses[RegClassID].contains(RegNo)) {
+ return Error(startLoc,
+ "register is not supported for use with this directive");
+ }
+ } else {
+ // Otherwise, an integer number matching the encoding of the desired
+ // register may appear.
+ int64_t EncodedReg;
+ if (getParser().parseAbsoluteExpression(EncodedReg))
+ return true;
+
+ // The SEH register number is the same as the encoding register number. Map
+ // from the encoding back to the LLVM register number.
+ RegNo = 0;
+ for (MCPhysReg Reg : X86MCRegisterClasses[RegClassID]) {
+ if (MRI->getEncodingValue(Reg) == EncodedReg) {
+ RegNo = Reg;
+ break;
+ }
+ }
+ if (RegNo == 0) {
+ return Error(startLoc,
+ "incorrect register number for use with this directive");
+ }
+ }
+
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushReg(Reg, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSetFrame(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify a stack pointer offset");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISetFrame(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveReg(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::GR64RegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveReg(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHSaveXMM(SMLoc Loc) {
+ unsigned Reg = 0;
+ int64_t Off;
+ if (parseSEHRegisterNumber(X86::VR128XRegClassID, Reg))
+ return true;
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("you must specify an offset on the stack");
+
+ getParser().Lex();
+ if (getParser().parseAbsoluteExpression(Off))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFISaveXMM(Reg, Off, Loc);
+ return false;
+}
+
+bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) {
+ bool Code = false;
+ StringRef CodeID;
+ if (getLexer().is(AsmToken::At)) {
+ SMLoc startLoc = getLexer().getLoc();
+ getParser().Lex();
+ if (!getParser().parseIdentifier(CodeID)) {
+ if (CodeID != "code")
+ return Error(startLoc, "expected @code");
+ Code = true;
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ getParser().Lex();
+ getStreamer().EmitWinCFIPushFrame(Code, Loc);
+ return false;
+}
+
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
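
The parseSEHRegisterNumber helper added above accepts either a register name or the raw Win64 SEH encoding number, and maps the number back to an LLVM register by scanning the requested register class. Below is a minimal standalone sketch of that lookup, not part of the patch; lookupByEncoding is a hypothetical name and the MCRegisterInfo/MCRegisterClass arguments are assumed to come from the caller.

    #include "llvm/MC/MCRegisterInfo.h"
    #include <cstdint>

    // Return the register in RC whose hardware encoding equals EncodedReg, or
    // 0 (no register) if none matches; the parser reports "incorrect register
    // number" in the latter case.
    static unsigned lookupByEncoding(const llvm::MCRegisterInfo &MRI,
                                     const llvm::MCRegisterClass &RC,
                                     int64_t EncodedReg) {
      for (llvm::MCPhysReg Reg : RC)
        if (MRI.getEncodingValue(Reg) == EncodedReg)
          return Reg;
      return 0;
    }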
diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index 5bc979d1f18c..e9be28ca77b0 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
@@ -35,6 +35,10 @@ inline bool isImmUnsignedi8Value(uint64_t Value) {
return isUInt<8>(Value) || isInt<8>(Value);
}
+inline bool isImmUnsignedi4Value(uint64_t Value) {
+ return isUInt<4>(Value);
+}
+
} // End of namespace llvm
#endif
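
For reference, the new i4 predicate accepts exactly the values 0 through 15, which is the range quoted by the Match_InvalidImmUnsignedi4 diagnostic added to the parser. A trivial equivalent check, written out only for illustration (fitsUnsignedI4 is a hypothetical name):

    #include <cstdint>

    // Same acceptance set as isUInt<4>(Value): the unsigned values 0..15.
    static bool fitsUnsignedI4(uint64_t Value) { return Value < 16; }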
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index a771ba366318..3a76d023e640 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -260,6 +260,15 @@ struct X86Operand final : public MCParsedAsmOperand {
return isImmSExti64i32Value(CE->getValue());
}
+ bool isImmUnsignedi4() const {
+ if (!isImm()) return false;
+ // If this isn't a constant expr, reject it. The immediate byte is shared
+ // with a register encoding. We can't have it affected by a relocation.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ return isImmUnsignedi4Value(CE->getValue());
+ }
+
bool isImmUnsignedi8() const {
if (!isImm()) return false;
// If this isn't a constant expr, just assume it fits and let relaxation
@@ -491,7 +500,7 @@ struct X86Operand final : public MCParsedAsmOperand {
void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- unsigned RegNo = getReg();
+ MCRegister RegNo = getReg();
if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
RegNo = getX86SubSuperRegister(RegNo, 32);
Inst.addOperand(MCOperand::createReg(RegNo));
@@ -572,7 +581,7 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand> CreateToken(StringRef Str, SMLoc Loc) {
SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
- auto Res = llvm::make_unique<X86Operand>(Token, Loc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Token, Loc, EndLoc);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
return Res;
@@ -582,7 +591,7 @@ struct X86Operand final : public MCParsedAsmOperand {
CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc(),
StringRef SymName = StringRef(), void *OpDecl = nullptr) {
- auto Res = llvm::make_unique<X86Operand>(Register, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
@@ -593,19 +602,19 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateDXReg(SMLoc StartLoc, SMLoc EndLoc) {
- return llvm::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
+ return std::make_unique<X86Operand>(DXRegister, StartLoc, EndLoc);
}
static std::unique_ptr<X86Operand>
CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
- auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
Res->Pref.Prefixes = Prefixes;
return Res;
}
static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
SMLoc StartLoc, SMLoc EndLoc) {
- auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
return Res;
}
@@ -615,7 +624,7 @@ struct X86Operand final : public MCParsedAsmOperand {
CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0, StringRef SymName = StringRef(),
void *OpDecl = nullptr, unsigned FrontendSize = 0) {
- auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
@@ -643,7 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand {
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
"Invalid scale!");
- auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
+ auto Res = std::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index a241362a271d..e287f6625115 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -12,13 +12,14 @@
//
//===----------------------------------------------------------------------===//
+#include "X86DisassemblerDecoder.h"
+#include "llvm/ADT/StringRef.h"
+
#include <cstdarg> /* for va_*() */
#include <cstdio> /* for vsnprintf() */
#include <cstdlib> /* for exit() */
#include <cstring> /* for memset() */
-#include "X86DisassemblerDecoder.h"
-
using namespace llvm::X86Disassembler;
/// Specifies whether a ModR/M byte is needed and (if so) which
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 54413fa1a02f..f08fcb575bf0 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -287,7 +287,7 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const {
// Relax if the value is too big for a (signed) i8.
- return int64_t(Value) != int64_t(int8_t(Value));
+ return !isInt<8>(Value);
}
// FIXME: Can tblgen help at all here to verify there aren't other instructions
@@ -557,7 +557,7 @@ protected:
// If the frame pointer is other than esp/rsp, we do not have a way to
// generate a compact unwinding representation, so bail out.
- if (MRI.getLLVMRegNum(Inst.getRegister(), true) !=
+ if (*MRI.getLLVMRegNum(Inst.getRegister(), true) !=
(Is64Bit ? X86::RBP : X86::EBP))
return 0;
@@ -605,7 +605,7 @@ protected:
// unwind encoding.
return CU::UNWIND_MODE_DWARF;
- unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ unsigned Reg = *MRI.getLLVMRegNum(Inst.getRegister(), true);
SavedRegs[SavedRegIdx++] = Reg;
StackAdjust += OffsetSize;
InstrOffset += PushInstrSize(Reg);
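
The fixupNeedsRelaxation change at the top of this file's diff replaces a manual truncate-and-compare with isInt<8>. A small self-contained sketch of why the two forms agree, using assumed example values (needsRelax is a hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Both the old int64_t(Value) != int64_t(int8_t(Value)) check and the new
    // !isInt<8>(Value) ask whether the value falls outside [-128, 127].
    static bool needsRelax(uint64_t Value) {
      int64_t S = static_cast<int64_t>(Value);
      return S < -128 || S > 127;
    }

    int main() {
      assert(!needsRelax(127) && !needsRelax(static_cast<uint64_t>(-128)));
      assert(needsRelax(128) && needsRelax(static_cast<uint64_t>(-129)));
    }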
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 232a06593238..bd009da60851 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -46,10 +46,10 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
enum X86_64RelType { RT64_NONE, RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
-static X86_64RelType getType64(unsigned Kind,
+static X86_64RelType getType64(MCFixupKind Kind,
MCSymbolRefExpr::VariantKind &Modifier,
bool &IsPCRel) {
- switch (Kind) {
+ switch (unsigned(Kind)) {
default:
llvm_unreachable("Unimplemented");
case FK_NONE:
@@ -97,7 +97,7 @@ static void checkIs32(MCContext &Ctx, SMLoc Loc, X86_64RelType Type) {
static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
MCSymbolRefExpr::VariantKind Modifier,
X86_64RelType Type, bool IsPCRel,
- unsigned Kind) {
+ MCFixupKind Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -202,7 +202,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
// and we want to keep back-compatibility.
if (!Ctx.getAsmInfo()->canRelaxRelocations())
return ELF::R_X86_64_GOTPCREL;
- switch (Kind) {
+ switch (unsigned(Kind)) {
default:
return ELF::R_X86_64_GOTPCREL;
case X86::reloc_riprel_4byte_relax:
@@ -237,7 +237,7 @@ static X86_32RelType getType32(X86_64RelType T) {
static unsigned getRelocType32(MCContext &Ctx,
MCSymbolRefExpr::VariantKind Modifier,
X86_32RelType Type, bool IsPCRel,
- unsigned Kind) {
+ MCFixupKind Kind) {
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -265,8 +265,9 @@ static unsigned getRelocType32(MCContext &Ctx,
if (!Ctx.getAsmInfo()->canRelaxRelocations())
return ELF::R_386_GOT32;
- return Kind == X86::reloc_signed_4byte_relax ? ELF::R_386_GOT32X
- : ELF::R_386_GOT32;
+ return Kind == MCFixupKind(X86::reloc_signed_4byte_relax)
+ ? ELF::R_386_GOT32X
+ : ELF::R_386_GOT32;
case MCSymbolRefExpr::VK_GOTOFF:
assert(Type == RT32_32);
assert(!IsPCRel);
@@ -317,7 +318,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
- unsigned Kind = Fixup.getKind();
+ MCFixupKind Kind = Fixup.getKind();
X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
if (getEMachine() == ELF::EM_X86_64)
return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
@@ -329,5 +330,5 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) {
- return llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
+ return std::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index e1125c176b25..d986c829d98e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -163,5 +163,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
TextAlignFillValue = 0x90;
+ AllowAtInName = true;
+
UseIntegratedAssembler = true;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 31d26d08a63f..ac36bf3a12fa 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -862,6 +862,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_B = ~(BaseRegEnc >> 3) & 1;
unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
+ if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
+ EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
+
break;
}
case X86II::MRMSrcReg: {
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index ce05ad974507..ced9eacc8b97 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -70,6 +70,10 @@ unsigned X86_MC::getDwarfRegFlavour(const Triple &TT, bool isEH) {
return DWARFFlavour::X86_32_Generic;
}
+bool X86_MC::hasLockPrefix(const MCInst &MI) {
+ return MI.getFlags() & X86::IP_HAS_LOCK;
+}
+
void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
// FIXME: TableGen these.
for (unsigned Reg = X86::NoRegister + 1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
@@ -399,6 +403,9 @@ public:
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
uint64_t GotSectionVA,
const Triple &TargetTriple) const override;
+ Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
+ uint64_t Addr,
+ uint64_t Size) const override;
};
#define GET_STIPREDICATE_DEFS_FOR_MC_ANALYSIS
@@ -511,7 +518,31 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries(
return findX86_64PltEntries(PltSectionVA, PltContents);
default:
return {};
- }
+ }
+}
+
+Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
+ const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
+ const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(MCID.TSFlags);
+ if (MemOpStart == -1)
+ return None;
+ MemOpStart += X86II::getOperandBias(MCID);
+
+ const MCOperand &SegReg = Inst.getOperand(MemOpStart + X86::AddrSegmentReg);
+ const MCOperand &BaseReg = Inst.getOperand(MemOpStart + X86::AddrBaseReg);
+ const MCOperand &IndexReg = Inst.getOperand(MemOpStart + X86::AddrIndexReg);
+ const MCOperand &ScaleAmt = Inst.getOperand(MemOpStart + X86::AddrScaleAmt);
+ const MCOperand &Disp = Inst.getOperand(MemOpStart + X86::AddrDisp);
+ if (SegReg.getReg() != 0 || IndexReg.getReg() != 0 || ScaleAmt.getImm() != 1 ||
+ !Disp.isImm())
+ return None;
+
+ // RIP-relative addressing.
+ if (BaseReg.getReg() == X86::RIP)
+ return Addr + Size + Disp.getImm();
+
+ return None;
}
} // end of namespace X86_MC
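
The new evaluateMemoryOperandAddress only resolves the plain RIP-relative form: no segment, no index register, scale 1, and a constant displacement. A worked example with assumed numbers, showing that the resolved address is the end of the instruction plus the displacement:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Addr = 0x401000; // address of the instruction (assumed)
      uint64_t Size = 7;        // instruction length in bytes (assumed)
      int64_t Disp = 0x2f0a;    // displacement from the memory operand (assumed)
      // RIP-relative addressing is relative to the next instruction.
      std::printf("0x%llx\n",
                  (unsigned long long)(Addr + Size + Disp)); // prints 0x403f11
    }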
@@ -567,13 +598,13 @@ extern "C" void LLVMInitializeX86TargetMC() {
createX86_64AsmBackend);
}
-unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
- bool High) {
+MCRegister llvm::getX86SubSuperRegisterOrZero(MCRegister Reg, unsigned Size,
+ bool High) {
switch (Size) {
- default: return 0;
+ default: return X86::NoRegister;
case 8:
if (High) {
- switch (Reg) {
+ switch (Reg.id()) {
default: return getX86SubSuperRegisterOrZero(Reg, 64);
case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
return X86::SI;
@@ -593,8 +624,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::BH;
}
} else {
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AL;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -630,8 +661,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
}
}
case 16:
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -666,8 +697,8 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::R15W;
}
case 32:
- switch (Reg) {
- default: return 0;
+ switch (Reg.id()) {
+ default: return X86::NoRegister;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::EAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -702,7 +733,7 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
return X86::R15D;
}
case 64:
- switch (Reg) {
+ switch (Reg.id()) {
default: return 0;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::RAX;
@@ -740,9 +771,9 @@ unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size,
}
}
-unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) {
- unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
- assert(Res != 0 && "Unexpected register or VT");
+MCRegister llvm::getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High) {
+ MCRegister Res = getX86SubSuperRegisterOrZero(Reg, Size, High);
+ assert(Res != X86::NoRegister && "Unexpected register or VT");
return Res;
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 00dd5908cbf5..0c789061f0e1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/DataTypes.h"
#include <string>
@@ -57,6 +58,10 @@ unsigned getDwarfRegFlavour(const Triple &TT, bool isEH);
void initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI);
+
+/// Returns true if this instruction has a LOCK prefix.
+bool hasLockPrefix(const MCInst &MI);
+
/// Create a X86 MCSubtargetInfo instance. This is exposed so Asm parser, etc.
/// do not need to go through TargetRegistry.
MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU,
@@ -111,12 +116,12 @@ createX86WinCOFFObjectWriter(bool Is64Bit);
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
/// Aborts on error.
-unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false);
+MCRegister getX86SubSuperRegister(MCRegister, unsigned, bool High=false);
/// Returns the sub or super register of a specific X86 register.
/// Like getX86SubSuperRegister() but returns 0 on error.
-unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned,
- bool High = false);
+MCRegister getX86SubSuperRegisterOrZero(MCRegister, unsigned,
+ bool High = false);
} // End llvm namespace
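
A short usage sketch of the retyped helpers, based on the documented EAX/AX example and the switch tables in X86MCTargetDesc.cpp; the include path and surrounding code are assumed, not part of the patch.

    #include "X86MCTargetDesc.h"
    using namespace llvm;

    static void subSuperRegisterExamples() {
      MCRegister R16 = getX86SubSuperRegister(X86::EAX, 16);               // X86::AX
      MCRegister R64 = getX86SubSuperRegister(X86::AX, 64);                // X86::RAX
      MCRegister Hi8 = getX86SubSuperRegister(X86::RAX, 8, /*High=*/true); // X86::AH
      (void)R16; (void)R64; (void)Hi8;
    }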
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index fc7e99f61e5e..b67a7508fe72 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -276,7 +276,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
// x86_64 distinguishes movq foo@GOTPCREL so that the linker can
// rewrite the movq to an leaq at link time if the symbol ends up in
// the same linkage unit.
- if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+ if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load)
Type = MachO::X86_64_RELOC_GOT_LOAD;
else
Type = MachO::X86_64_RELOC_GOT;
@@ -339,8 +339,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
return;
} else {
Type = MachO::X86_64_RELOC_UNSIGNED;
- unsigned Kind = Fixup.getKind();
- if (Kind == X86::reloc_signed_4byte) {
+ if (Fixup.getTargetKind() == X86::reloc_signed_4byte) {
Asm.getContext().reportError(
Fixup.getLoc(),
"32-bit absolute addressing is not supported in 64-bit mode");
@@ -600,5 +599,5 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype) {
- return llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
+ return std::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 3baab9da1c41..760239f76505 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -109,5 +109,5 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createX86WinCOFFObjectWriter(bool Is64Bit) {
- return llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
+ return std::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 796a27a17255..db624378d517 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -35,8 +35,9 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
MCStreamer::EmitWinEHHandlerData(Loc);
// We have to emit the unwind info now, because this directive
- // actually switches to the .xdata section!
- EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+ // actually switches to the .xdata section.
+ if (WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo())
+ EHStreamer.EmitUnwindInfo(*this, CurFrame);
}
void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index e9987d1f62bd..d5494ef12370 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -170,7 +170,7 @@ bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
L, "opening new .cv_fpo_proc before closing previous frame");
return true;
}
- CurFPOData = llvm::make_unique<FPOData>();
+ CurFPOData = std::make_unique<FPOData>();
CurFPOData->Function = ProcSym;
CurFPOData->Begin = emitFPOLabel();
CurFPOData->ParamsSize = ParamsSize;
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index a95f68434d12..6840fc12751d 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -81,6 +81,12 @@ FunctionPass *createX86FlagsCopyLoweringPass();
/// Return a pass that expands WinAlloca pseudo-instructions.
FunctionPass *createX86WinAllocaExpander();
+/// Return a pass that inserts int3 at the end of the function if it ends with a
+/// CALL instruction. The pass does the same for each funclet as well. This
+/// ensures that the open interval of function start and end PCs contains all
+/// return addresses for the benefit of the Windows x64 unwinder.
+FunctionPass *createX86AvoidTrailingCallPass();
+
/// Return a pass that optimizes the code-size of x86 call sequences. This is
/// done by replacing esp-relative movs with pushes.
FunctionPass *createX86CallFrameOptimization();
@@ -137,13 +143,13 @@ void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExpandPseudoPass(PassRegistry&);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
+void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 3112f00c91f2..d8631aca2734 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -95,7 +95,8 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions">;
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
- "64-bit with cmpxchg16b">;
+ "64-bit with cmpxchg16b",
+ [FeatureCMPXCHG8B]>;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
@@ -240,8 +241,11 @@ def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true",
"Enable Cache Demote">;
def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
"Support ptwrite instruction">;
-def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
- "Support MPX instructions">;
+// FIXME: This feature is deprecated in 10.0 and should not be used for
+// anything, but removing it would break IR files that may contain it in a
+// target-feature attribute.
+def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
+ "Deprecated. Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
@@ -374,6 +378,10 @@ def FeatureHasFastGather
: SubtargetFeature<"fast-gather", "HasFastGather", "true",
"Indicates if gather is reasonably fast">;
+def FeaturePrefer128Bit
+ : SubtargetFeature<"prefer-128-bit", "Prefer128Bit", "true",
+ "Prefer 128-bit AVX instructions">;
+
def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
@@ -449,6 +457,10 @@ def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"Merge branches to a three-way "
"conditional branch">;
+// Enable use of alias analysis during code generation.
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+ "Use alias analysis during codegen">;
+
// Bonnell
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
// Silvermont
@@ -579,7 +591,6 @@ def ProcessorFeatures {
// Skylake
list<SubtargetFeature> SKLAdditionalFeatures = [FeatureAES,
- FeatureMPX,
FeatureXSAVEC,
FeatureXSAVES,
FeatureCLFLUSHOPT,
@@ -594,6 +605,7 @@ def ProcessorFeatures {
// Skylake-AVX512
list<SubtargetFeature> SKXAdditionalFeatures = [FeatureAVX512,
+ FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -627,6 +639,7 @@ def ProcessorFeatures {
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
+ FeaturePrefer256Bit,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -665,6 +678,17 @@ def ProcessorFeatures {
list<SubtargetFeature> ICXFeatures =
!listconcat(ICLInheritableFeatures, ICXSpecificFeatures);
+ //Tigerlake
+ list<SubtargetFeature> TGLAdditionalFeatures = [FeatureVP2INTERSECT,
+ FeatureMOVDIRI,
+ FeatureMOVDIR64B,
+ FeatureSHSTK];
+ list<SubtargetFeature> TGLSpecificFeatures = [FeatureHasFastGather];
+ list<SubtargetFeature> TGLInheritableFeatures =
+ !listconcat(TGLAdditionalFeatures ,TGLSpecificFeatures);
+ list<SubtargetFeature> TGLFeatures =
+ !listconcat(ICLFeatures, TGLInheritableFeatures );
+
// Atom
list<SubtargetFeature> AtomInheritableFeatures = [FeatureX87,
FeatureCMPXCHG8B,
@@ -707,7 +731,6 @@ def ProcessorFeatures {
// Goldmont
list<SubtargetFeature> GLMAdditionalFeatures = [FeatureAES,
- FeatureMPX,
FeatureSHA,
FeatureRDSEED,
FeatureXSAVE,
@@ -786,6 +809,22 @@ def ProcessorFeatures {
list<SubtargetFeature> KNMFeatures =
!listconcat(KNLFeatures, [FeatureVPOPCNTDQ]);
+ // Barcelona
+ list<SubtargetFeature> BarcelonaInheritableFeatures = [FeatureX87,
+ FeatureCMPXCHG8B,
+ FeatureSSE4A,
+ Feature3DNowA,
+ FeatureFXSR,
+ FeatureNOPL,
+ FeatureCMPXCHG16B,
+ FeatureLZCNT,
+ FeaturePOPCNT,
+ FeatureSlowSHLD,
+ FeatureLAHFSAHF,
+ FeatureCMOV,
+ Feature64Bit,
+ FeatureFastScalarShiftMasks];
+ list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
list<SubtargetFeature> BtVer1InheritableFeatures = [FeatureX87,
@@ -1093,6 +1132,8 @@ def : ProcessorModel<"icelake-client", SkylakeServerModel,
ProcessorFeatures.ICLFeatures>;
def : ProcessorModel<"icelake-server", SkylakeServerModel,
ProcessorFeatures.ICXFeatures>;
+def : ProcessorModel<"tigerlake", SkylakeServerModel,
+ ProcessorFeatures.TGLFeatures>;
// AMD CPUs.
@@ -1129,10 +1170,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
}
foreach P = ["amdfam10", "barcelona"] in {
- def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE4A, Feature3DNowA,
- FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowSHLD, FeatureLAHFSAHF, FeatureCMOV,
- Feature64Bit, FeatureFastScalarShiftMasks]>;
+ def : Proc<P, ProcessorFeatures.BarcelonaFeatures>;
}
// Bobcat
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 80120722e0e6..8d27be30a277 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -242,7 +242,7 @@ void X86AsmPrinter::PrintModifiedOperand(const MachineInstr *MI, unsigned OpNo,
return PrintOperand(MI, OpNo, O);
if (MI->getInlineAsmDialect() == InlineAsm::AD_ATT)
O << '%';
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 :
(strcmp(Modifier+6,"32") == 0) ? 32 :
@@ -388,7 +388,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
bool EmitPercent = true;
if (!X86::GR8RegClass.contains(Reg) &&
@@ -575,7 +575,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
// Emitting note header.
int WordSize = TT.isArch64Bit() ? 8 : 4;
- EmitAlignment(WordSize == 4 ? 2 : 3);
+ EmitAlignment(WordSize == 4 ? Align(4) : Align(8));
OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
@@ -585,7 +585,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
OutStreamer->EmitIntValue(4, 4); // data size
OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data
- EmitAlignment(WordSize == 4 ? 2 : 3); // padding
+ EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
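
The EmitAlignment calls in this diff switch from a log2 exponent to an explicit llvm::Align in bytes, so the emitted note-section padding is unchanged: exponent 2 corresponds to 4-byte alignment and exponent 3 to 8-byte alignment. A tiny standalone check of that correspondence, for illustration only:

    #include <cassert>

    int main() {
      // old argument (log2 exponent) -> new argument (bytes)
      assert((1u << 2) == 4); // EmitAlignment(2) == EmitAlignment(Align(4))
      assert((1u << 3) == 8); // EmitAlignment(3) == EmitAlignment(Align(8))
    }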
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 3dcc1015dc7c..69c6b3356cbb 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -35,6 +35,7 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -390,7 +391,7 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
- unsigned Reg1 = MRI->createVirtualRegister(
+ Register Reg1 = MRI->createVirtualRegister(
TII->getRegClass(TII->get(NLoadOpcode), 0, TRI, *(MBB->getParent())));
MachineInstr *NewLoad =
BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
diff --git a/lib/Target/X86/X86AvoidTrailingCall.cpp b/lib/Target/X86/X86AvoidTrailingCall.cpp
new file mode 100644
index 000000000000..fb4f9e2901dc
--- /dev/null
+++ b/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -0,0 +1,108 @@
+//===----- X86AvoidTrailingCall.cpp - Insert int3 after trailing calls ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The Windows x64 unwinder has trouble unwinding the stack when a return
+// address points to the end of the function. This pass maintains the invariant
+// that every return address is inside the bounds of its parent function or
+// funclet by inserting int3 if the last instruction would otherwise be a call.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+#define DEBUG_TYPE "x86-avoid-trailing-call"
+
+using namespace llvm;
+
+namespace {
+
+class X86AvoidTrailingCallPass : public MachineFunctionPass {
+public:
+ X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ StringRef getPassName() const override {
+ return "X86 avoid trailing call pass";
+ }
+ static char ID;
+};
+
+char X86AvoidTrailingCallPass::ID = 0;
+
+} // end anonymous namespace
+
+FunctionPass *llvm::createX86AvoidTrailingCallPass() {
+ return new X86AvoidTrailingCallPass();
+}
+
+// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
+// expand to nothing, and some expand to code. This logic conservatively assumes
+// they might expand to nothing.
+static bool isRealInstruction(MachineInstr &MI) {
+ return !MI.isPseudo() && !MI.isMetaInstruction();
+}
+
+// Return true if this is a call instruction, but not a tail call.
+static bool isCallInstruction(const MachineInstr &MI) {
+ return MI.isCall() && !MI.isReturn();
+}
+
+bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86InstrInfo &TII = *STI.getInstrInfo();
+ assert(STI.isTargetWin64() && "pass only runs on Win64");
+
+ // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops
+ // before epilogues.
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Look for basic blocks that precede funclet entries or are at the end of
+ // the function.
+ MachineBasicBlock *NextMBB = MBB.getNextNode();
+ if (NextMBB && !NextMBB->isEHFuncletEntry())
+ continue;
+
+ // Find the last real instruction in this block, or previous blocks if this
+ // block is empty.
+ MachineBasicBlock::reverse_iterator LastRealInstr;
+ for (MachineBasicBlock &RMBB :
+ make_range(MBB.getReverseIterator(), MF.rend())) {
+ LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction);
+ if (LastRealInstr != RMBB.rend())
+ break;
+ }
+
+ // Do nothing if this function or funclet has no instructions.
+ if (LastRealInstr == MF.begin()->rend())
+ continue;
+
+ // If this is a call instruction, insert int3 right after it with the same
+ // DebugLoc. Convert back to a forward iterator and advance the insertion
+ // position once.
+ if (isCallInstruction(*LastRealInstr)) {
+ LLVM_DEBUG({
+ dbgs() << "inserting int3 after trailing call instruction:\n";
+ LastRealInstr->dump();
+ dbgs() << '\n';
+ });
+
+ MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse());
+ BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(),
+ TII.get(X86::INT3));
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 4df849a2e14c..ad7e32b4efc8 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -155,12 +155,22 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// This is bad, and breaks SP adjustment.
// So, check that all of the frames in the function are closed inside
// the same block, and, for good measure, that there are no nested frames.
+ //
+ // If any call allocates more argument stack memory than the stack
+ // probe size, don't do this optimization. Otherwise, this pass
+ // would need to synthesize additional stack probe calls to allocate
+ // memory for arguments.
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ bool UseStackProbe =
+ !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
for (MachineBasicBlock &BB : MF) {
bool InsideFrameSequence = false;
for (MachineInstr &MI : BB) {
if (MI.getOpcode() == FrameSetupOpcode) {
+ if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe)
+ return false;
if (InsideFrameSequence)
return false;
InsideFrameSequence = true;
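
The new legality check in this hunk gives up when any single call frame's argument area is at least the stack probe size, since replacing the frame setup with pushes would leave that memory unprobed. A condensed sketch of the guard with hypothetical names, shown only to restate the condition:

    #include <cstdint>

    // False means the push transformation must be skipped: stack probes are in
    // use and this call's argument block spans at least one probe interval.
    static bool allowPushTransform(uint64_t FrameSize, uint64_t StackProbeSize,
                                   bool UseStackProbe) {
      return !(UseStackProbe && FrameSize >= StackProbeSize);
    }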
@@ -325,8 +335,8 @@ X86CallFrameOptimization::classifyInstruction(
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned int Reg = MO.getReg();
- if (!RegInfo.isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
return Exit;
@@ -370,7 +380,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
++I;
- unsigned StackPtr = RegInfo.getStackRegister();
+ Register StackPtr = RegInfo.getStackRegister();
auto StackPtrCopyInst = MBB.end();
// SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
// register. If it's there, use that virtual register as stack pointer
@@ -443,8 +453,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
for (const MachineOperand &MO : I->uses()) {
if (!MO.isReg())
continue;
- unsigned int Reg = MO.getReg();
- if (RegInfo.isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg))
UsedRegs.insert(Reg);
}
}
@@ -524,12 +534,12 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
break;
case X86::MOV32mr:
case X86::MOV64mr: {
- unsigned int Reg = PushOp.getReg();
+ Register Reg = PushOp.getReg();
// If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
// in preparation for the PUSH64. The upper 32 bits can be undef.
if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
- unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
@@ -598,7 +608,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
// movl %eax, (%esp)
// call
// Get rid of those with prejudice.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return nullptr;
// Make sure this is the only use of Reg.
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index b16b3839c85a..7ee637cfd523 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -102,6 +102,8 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
DL(MIRBuilder.getMF().getDataLayout()),
STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
@@ -155,8 +157,9 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
- const CallLowering::ArgInfo &Info, CCState &State) override {
- bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
+ CCState &State) override {
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
StackSize = State.getNextStackOffset();
static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
@@ -229,7 +232,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
: ValueHandler(MIRBuilder, MRI, AssignFn),
DL(MIRBuilder.getMF().getDataLayout()) {}
- bool isArgumentHandler() const override { return true; }
+ bool isIncomingArgumentHandler() const override { return true; }
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -237,7 +240,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- unsigned AddrReg = MRI.createGenericVirtualRegister(
+ Register AddrReg = MRI.createGenericVirtualRegister(
LLT::pointer(0, DL.getPointerSizeInBits(0)));
MIRBuilder.buildFrameIndex(AddrReg, FI);
return AddrReg;
@@ -301,6 +304,7 @@ struct FormalArgHandler : public IncomingValueHandler {
: IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMRI()->addLiveIn(PhysReg);
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -372,10 +376,7 @@ bool X86CallLowering::lowerFormalArguments(
}
bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv,
- const MachineOperand &Callee,
- const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const {
+ CallLoweringInfo &Info) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -385,8 +386,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
auto TRI = STI.getRegisterInfo();
// Handle only Linux C, X86_64_SysV calling conventions for now.
- if (!STI.isTargetLinux() ||
- !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV))
+ if (!STI.isTargetLinux() || !(Info.CallConv == CallingConv::C ||
+ Info.CallConv == CallingConv::X86_64_SysV))
return false;
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
@@ -395,18 +396,19 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
bool Is64Bit = STI.is64Bit();
- unsigned CallOpc = Callee.isReg()
+ unsigned CallOpc = Info.Callee.isReg()
? (Is64Bit ? X86::CALL64r : X86::CALL32r)
: (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
- auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask(
- TRI->getCallPreservedMask(MF, CallConv));
+ auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc)
+ .add(Info.Callee)
+ .addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv));
SmallVector<ArgInfo, 8> SplitArgs;
- for (const auto &OrigArg : OrigArgs) {
+ for (const auto &OrigArg : Info.OrigArgs) {
// TODO: handle not simple cases.
- if (OrigArg.Flags.isByVal())
+ if (OrigArg.Flags[0].isByVal())
return false;
if (OrigArg.Regs.size() > 1)
@@ -423,8 +425,8 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
- bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed;
- if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) {
+ bool IsFixed = Info.OrigArgs.empty() ? true : Info.OrigArgs.back().IsFixed;
+ if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(Info.CallConv)) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -445,23 +447,24 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
- if (Callee.isReg())
+ if (Info.Callee.isReg())
MIB->getOperand(0).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
- *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Callee, 0));
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), Info.Callee,
+ 0));
// Finally we can copy the returned value back into its virtual-register. In
// symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
- if (!OrigRet.Ty->isVoidTy()) {
- if (OrigRet.Regs.size() > 1)
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ if (Info.OrigRet.Regs.size() > 1)
return false;
SplitArgs.clear();
SmallVector<Register, 8> NewRegs;
- if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
+ if (!splitToValueTypes(Info.OrigRet, SplitArgs, DL, MRI,
[&](ArrayRef<Register> Regs) {
NewRegs.assign(Regs.begin(), Regs.end());
}))
@@ -472,7 +475,7 @@ bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
if (!NewRegs.empty())
- MIRBuilder.buildMerge(OrigRet.Regs[0], NewRegs);
+ MIRBuilder.buildMerge(Info.OrigRet.Regs[0], NewRegs);
}
CallSeqStart.addImm(Handler.getStackSize())
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index 0445331bc3ff..444a0c7d0122 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -34,9 +34,8 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
- const MachineOperand &Callee, const ArgInfo &OrigRet,
- ArrayRef<ArgInfo> OrigArgs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const override;
private:
/// A function of this type is used to perform value split action.
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 1c3034a5116a..4c49d68bec99 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -433,6 +433,7 @@ defm X86_SysV64_RegCall :
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
@@ -1000,6 +1001,7 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+ CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index a61fa3246f09..5123853f5455 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -436,8 +436,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// Checks for "isUse()" as "uses()" returns also implicit definitions.
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
+ Register Reg = MO.getReg();
+ auto &RDM = RegDefMaps[Register::isVirtualRegister(Reg)];
if (MachineInstr *DefMI = RDM.lookup(Reg)) {
OperandToDefMap[&MO] = DefMI;
DepthInfo Info = DepthMap.lookup(DefMI);
@@ -456,8 +456,8 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
for (auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
+ Register Reg = MO.getReg();
+ RegDefMaps[Register::isVirtualRegister(Reg)][Reg] = &MI;
}
unsigned Latency = TSchedModel.computeInstrLatency(&MI);
@@ -710,7 +710,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Skip any CMOVs in this group which don't load from memory.
if (!MI.mayLoad()) {
// Remember the false-side register input.
- unsigned FalseReg =
+ Register FalseReg =
MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
// Walk back through any intermediate cmovs referenced.
while (true) {
@@ -753,7 +753,7 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
// Get a fresh register to use as the destination of the MOV.
const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
- unsigned TmpReg = MRI->createVirtualRegister(RC);
+ Register TmpReg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 4> NewMIs;
bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
@@ -810,9 +810,9 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are processing is the opposite condition from the jump we
// generated, then we have to swap the operands for the PHI that is going to
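The pattern in this file (and in most files below) is a mechanical migration from unsigned register ids to the Register wrapper, with the static isVirtualRegister/isPhysicalRegister queries moving from TargetRegisterInfo to Register. A minimal sketch of the new idiom, assuming the Register API from llvm/CodeGen/Register.h; illustration only:

#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Illustration only: Register converts implicitly to/from the old unsigned
// ids, so existing call sites keep compiling while gaining typed queries.
static void classify(const MachineOperand &MO) {
  Register Reg = MO.getReg();
  if (Register::isVirtualRegister(Reg)) {
    // virtual-register path, as used by RegDefMaps above
  } else if (Register::isPhysicalRegister(Reg)) {
    // physical-register path
  }
}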
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 9dea94f1368d..1bf2d5ba7b8f 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -564,7 +564,7 @@ X86CondBrFolding::analyzeMBB(MachineBasicBlock &MBB) {
Modified = false;
break;
}
- return llvm::make_unique<TargetMBBInfo>(TargetMBBInfo{
+ return std::make_unique<TargetMBBInfo>(TargetMBBInfo{
TBB, FBB, BrInstr, CmpInstr, CC, SrcReg, CmpValue, Modified, CmpBrOnly});
}
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 18bbfa32e11b..b4cf5cafbc6e 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -182,7 +182,7 @@ public:
MachineBasicBlock *MBB = MI->getParent();
auto &DL = MI->getDebugLoc();
- unsigned Reg = MRI->createVirtualRegister(
+ Register Reg = MRI->createVirtualRegister(
TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
*MBB->getParent()));
MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
@@ -219,13 +219,13 @@ public:
// Don't allow copies to/from GR8/GR16 physical registers.
// FIXME: Is there some better way to support this?
- unsigned DstReg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ Register DstReg = MI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg) &&
(X86::GR8RegClass.contains(DstReg) ||
X86::GR16RegClass.contains(DstReg)))
return false;
- unsigned SrcReg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (Register::isPhysicalRegister(SrcReg) &&
(X86::GR8RegClass.contains(SrcReg) ||
X86::GR16RegClass.contains(SrcReg)))
return false;
@@ -241,7 +241,7 @@ public:
// Physical registers will not be converted. Assume that converting the
// COPY to the destination domain will eventually result in an actual
// instruction.
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
return 1;
RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
@@ -436,7 +436,7 @@ void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
if (EnclosedEdges.count(Reg))
return;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return;
if (!MRI->hasOneDef(Reg))
@@ -593,8 +593,8 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
if (!DefOp.isReg())
continue;
- unsigned DefReg = DefOp.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ Register DefReg = DefOp.getReg();
+ if (!Register::isVirtualRegister(DefReg)) {
C.setAllIllegal();
continue;
}
@@ -751,7 +751,7 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
// Go over all virtual registers and calculate a closure.
unsigned ClosureID = 0;
for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned Reg = Register::index2VirtReg(Idx);
// Only the GPR domain is currently supported as a source domain.
if (!isGPR(MRI->getRegClass(Reg)))
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 58680f1815bb..24c8e6d6f6eb 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -131,7 +131,7 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) &&
"ZMM instructions should not be in the EVEX->VEX tables");
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index b8624b40f2f7..9126a1fbea52 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -194,7 +194,8 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
MachineOperand &JumpTarget = MBBI->getOperand(0);
- MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? X86::AddrNumOperands
+ : 1);
assert(StackAdjust.isImm() && "Expecting immediate value.");
// Adjust stack pointer.
@@ -259,7 +260,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
? X86::TAILJMPm
: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
- for (unsigned i = 0; i != 5; ++i)
+ for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
JumpTarget.setIsKill();
@@ -274,7 +275,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
- MBB.getParent()->updateCallSiteInfo(&*MBBI, &NewMI);
+ MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -287,7 +288,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
assert(DestAddr.isReg() && "Offset should be in register!");
const bool Uses64BitFramePtr =
STI->isTarget64BitLP64() || STI->isTargetNaCl64();
- unsigned StackPtr = TRI->getStackRegister();
+ Register StackPtr = TRI->getStackRegister();
BuildMI(MBB, MBBI, DL,
TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
.addReg(DestAddr.getReg());
@@ -347,7 +348,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// actualcmpxchg Addr
// [E|R]BX = SaveRbx
const MachineOperand &InArg = MBBI->getOperand(6);
- unsigned SaveRbx = MBBI->getOperand(7).getReg();
+ Register SaveRbx = MBBI->getOperand(7).getReg();
unsigned ActualInArg =
Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
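The hard-coded operand count 5 above becomes X86::AddrNumOperands. For reference, an x86 memory reference is always modeled as five operands (base, scale, index, displacement, segment), addressed with the X86::Addr* indices from X86BaseInfo.h. A short sketch using those names; in-tree include paths and the helper name are assumptions, illustration only:

#include "MCTargetDesc/X86BaseInfo.h" // in-tree include (assumption)
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Illustration only: copy a memory reference without hard-coding "5".
static void copyMemOperands(const MachineInstr &Src, MachineInstrBuilder &MIB,
                            unsigned MemOpStart) {
  for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
    MIB.add(Src.getOperand(MemOpStart + i));
  // MemOpStart + X86::AddrDisp would pick out just the displacement, etc.
}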
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 7b9ce0271205..e5e089d07d55 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -1160,6 +1160,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::C &&
CC != CallingConv::Fast &&
+ CC != CallingConv::Tail &&
CC != CallingConv::X86_FastCall &&
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
@@ -1173,7 +1174,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Let SDISel handle vararg functions.
@@ -1241,7 +1243,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
}
// Make the copy.
- unsigned DstReg = VA.getLocReg();
+ Register DstReg = VA.getLocReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
// Avoid a cross-class copy. This is very unlikely.
if (!SrcRC->contains(DstReg))
@@ -3157,7 +3159,7 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
if (Subtarget->getTargetTriple().isOSMSVCRT())
return 0;
if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE)
+ CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
if (CS)
@@ -3208,6 +3210,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
default: return false;
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::WebKit_JS:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
@@ -3224,7 +3227,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
- if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+ if ((CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) ||
+ CC == CallingConv::Tail)
return false;
// Don't know how to handle Win64 varargs yet. Nothing special needed for
@@ -3387,6 +3391,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CCValAssign::SExtUpper:
case CCValAssign::ZExtUpper:
case CCValAssign::FPExt:
+ case CCValAssign::Trunc:
llvm_unreachable("Unexpected loc info!");
case CCValAssign::Indirect:
// FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
@@ -3547,7 +3552,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
- unsigned SrcReg = VA.getLocReg();
+ Register SrcReg = VA.getLocReg();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
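The FastISel changes above repeat one predicate: tailcc always requires the guaranteed-tail-call path, while fastcc only does so under -tailcallopt, and FastISel bails out in either case. The same guard factored into a helper; the helper name is hypothetical, the condition itself is taken from the hunks above:

#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

// Illustration only: the condition FastISel uses above to give up and let
// SelectionDAG handle guaranteed tail calls.
static bool needsGuaranteedTailCall(CallingConv::ID CC,
                                    const TargetOptions &Opts) {
  return CC == CallingConv::Tail ||
         (CC == CallingConv::Fast && Opts.GuaranteedTailCallOpt);
}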
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index bf541d933790..9f7c4afde760 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -80,7 +80,7 @@ class FixupBWInstPass : public MachineFunctionPass {
/// destination register of the MachineInstr passed in. It returns true if
/// that super register is dead just prior to \p OrigMI, and false if not.
bool getSuperRegDestIfDead(MachineInstr *OrigMI,
- unsigned &SuperDestReg) const;
+ Register &SuperDestReg) const;
/// Change the MachineInstr \p MI into the equivalent extending load to 32 bit
/// register if it is safe to do so. Return the replacement instruction if
@@ -92,6 +92,12 @@ class FixupBWInstPass : public MachineFunctionPass {
/// nullptr.
MachineInstr *tryReplaceCopy(MachineInstr *MI) const;
+ /// Change the MachineInstr \p MI into the equivalent extend to 32 bit
+ /// register if it is safe to do so. Return the replacement instruction if
+ /// OK, otherwise return nullptr.
+ MachineInstr *tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const;
+
// Change the MachineInstr \p MI into an equivalent 32 bit instruction if
// possible. Return the replacement instruction if OK, return nullptr
// otherwise.
@@ -169,10 +175,10 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
///
/// If so, return that super register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
- unsigned &SuperDestReg) const {
+ Register &SuperDestReg) const {
auto *TRI = &TII->getRegisterInfo();
- unsigned OrigDestReg = OrigMI->getOperand(0).getReg();
+ Register OrigDestReg = OrigMI->getOperand(0).getReg();
SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);
const auto SubRegIdx = TRI->getSubRegIndex(SuperDestReg, OrigDestReg);
@@ -232,12 +238,12 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
// %ax = KILL %ax, implicit killed %eax
// RET 0, %ax
unsigned Opc = OrigMI->getOpcode(); (void)Opc;
- // These are the opcodes currently handled by the pass, if something
- // else will be added we need to ensure that new opcode has the same
- // properties.
- assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
- Opc == X86::MOV16rr) &&
- "Unexpected opcode.");
+ // These are the opcodes currently known to work with the code below; if
+ // something else is added, we need to ensure that the new opcode has the
+ // same properties.
+ if (Opc != X86::MOV8rm && Opc != X86::MOV16rm && Opc != X86::MOV8rr &&
+ Opc != X86::MOV16rr)
+ return false;
bool IsDefined = false;
for (auto &MO: OrigMI->implicit_operands()) {
@@ -247,7 +253,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
if (MO.isDef() && TRI->isSuperRegisterEq(OrigDestReg, MO.getReg()))
- IsDefined = true;
+ IsDefined = true;
// If MO is a use of any part of the destination register but is not equal
// to OrigDestReg or one of its subregisters, we cannot use SuperDestReg.
@@ -268,7 +274,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
MachineInstr *MI) const {
- unsigned NewDestReg;
+ Register NewDestReg;
// We are going to try to rewrite this load to a larger zero-extending
// load. This is safe if all portions of the 32 bit super-register
@@ -295,11 +301,11 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
auto &OldDest = MI->getOperand(0);
auto &OldSrc = MI->getOperand(1);
- unsigned NewDestReg;
+ Register NewDestReg;
if (!getSuperRegDestIfDead(MI, NewDestReg))
return nullptr;
- unsigned NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
+ Register NewSrcReg = getX86SubSuperRegister(OldSrc.getReg(), 32);
// This is only correct if we access the same subregister index: otherwise,
// we could try to replace "movb %ah, %al" with "movl %eax, %eax".
@@ -326,6 +332,33 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
return MIB;
}
+MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
+ MachineInstr *MI) const {
+ Register NewDestReg;
+ if (!getSuperRegDestIfDead(MI, NewDestReg))
+ return nullptr;
+
+ // Don't interfere with formation of CBW instructions which should be a
+ // shorter encoding than even the MOVSX32rr8. It's also immune to partial
+ // merge issues on Intel CPUs.
+ if (MI->getOpcode() == X86::MOVSX16rr8 &&
+ MI->getOperand(0).getReg() == X86::AX &&
+ MI->getOperand(1).getReg() == X86::AL)
+ return nullptr;
+
+ // Safe to change the instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i)
+ MIB.add(MI->getOperand(i));
+
+ MIB.setMemRefs(MI->memoperands());
+
+ return MIB;
+}
+
MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
MachineBasicBlock &MBB) const {
// See if this is an instruction of the type we are currently looking for.
@@ -355,6 +388,15 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
// of the register.
return tryReplaceCopy(MI);
+ case X86::MOVSX16rr8:
+ return tryReplaceExtend(X86::MOVSX32rr8, MI);
+ case X86::MOVSX16rm8:
+ return tryReplaceExtend(X86::MOVSX32rm8, MI);
+ case X86::MOVZX16rr8:
+ return tryReplaceExtend(X86::MOVZX32rr8, MI);
+ case X86::MOVZX16rm8:
+ return tryReplaceExtend(X86::MOVZX32rm8, MI);
+
default:
// nothing to do here.
break;
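The new tryReplaceExtend cases rewrite 8-to-16-bit sign/zero extends into their 8-to-32-bit forms whenever getSuperRegDestIfDead proves the full 32-bit register is dead, avoiding a partial-register write. The opcode pairs handled above, collected into one hypothetical helper for readability; illustration only:

#include "X86InstrInfo.h" // in-tree include for the X86:: opcode enum (assumption)
using namespace llvm;

// Illustration only: the 16-bit -> 32-bit widening pairs added above.
// Returns 0 for opcodes the pass leaves alone.
static unsigned getWidenedExtendOpcode(unsigned Opc) {
  switch (Opc) {
  case X86::MOVSX16rr8: return X86::MOVSX32rr8;
  case X86::MOVSX16rm8: return X86::MOVSX32rm8;
  case X86::MOVZX16rr8: return X86::MOVZX32rr8;
  case X86::MOVZX16rm8: return X86::MOVZX32rm8;
  default:              return 0;
  }
}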
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 041529a0be68..543dc8b00fa0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -67,8 +67,8 @@ class FixupLEAPass : public MachineFunctionPass {
/// - LEA that uses RIP relative addressing mode
/// - LEA that uses 16-bit addressing mode "
/// This function currently handles the first 2 cases only.
- MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineBasicBlock &MBB);
+ void processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB, bool OptIncDec);
/// Look for LEAs that are really two address LEAs that we might be able to
/// turn into regular ADD instructions.
@@ -216,14 +216,10 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
continue;
- if (IsSlowLEA) {
+ if (IsSlowLEA)
processInstructionForSlowLEA(I, MBB);
- } else if (IsSlow3OpsLEA) {
- if (auto *NewMI = processInstrForSlow3OpLEA(*I, MBB)) {
- MBB.erase(I);
- I = NewMI;
- }
- }
+ else if (IsSlow3OpsLEA)
+ processInstrForSlow3OpLEA(I, MBB, OptIncDec);
}
// Second pass for creating LEAs. This may reverse some of the
@@ -301,18 +297,14 @@ static inline bool isInefficientLEAReg(unsigned Reg) {
Reg == X86::R13D || Reg == X86::R13;
}
-static inline bool isRegOperand(const MachineOperand &Op) {
- return Op.isReg() && Op.getReg() != X86::NoRegister;
-}
-
/// Returns true if this LEA uses base and index registers, and the base register
/// is known to be inefficient for the subtarget.
// TODO: use a variant scheduling class to model the latency profile
// of LEA instructions, and implement this logic as a scheduling predicate.
static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
const MachineOperand &Index) {
- return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
- isRegOperand(Index);
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) && Index.isReg() &&
+ Index.getReg() != X86::NoRegister;
}
static inline bool hasLEAOffset(const MachineOperand &Offset) {
@@ -372,9 +364,9 @@ bool FixupLEAPass::optTwoAddrLEA(MachineBasicBlock::iterator &I,
!TII->isSafeToClobberEFLAGS(MBB, I))
return false;
- unsigned DestReg = MI.getOperand(0).getReg();
- unsigned BaseReg = Base.getReg();
- unsigned IndexReg = Index.getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
// Don't change stack adjustment LEAs.
if (UseLEAForSP && (DestReg == X86::ESP || DestReg == X86::RSP))
@@ -500,9 +492,9 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
if (Segment.getReg() != 0 || !Offset.isImm() ||
!TII->isSafeToClobberEFLAGS(MBB, I))
return;
- const unsigned DstR = Dst.getReg();
- const unsigned SrcR1 = Base.getReg();
- const unsigned SrcR2 = Index.getReg();
+ const Register DstR = Dst.getReg();
+ const Register SrcR1 = Base.getReg();
+ const Register SrcR2 = Index.getReg();
if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
return;
if (Scale.getImm() > 1)
@@ -534,111 +526,150 @@ void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
}
}
-MachineInstr *
-FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
- MachineBasicBlock &MBB) {
+void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
+ MachineBasicBlock &MBB,
+ bool OptIncDec) {
+ MachineInstr &MI = *I;
const unsigned LEAOpcode = MI.getOpcode();
- const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Dest = MI.getOperand(0);
const MachineOperand &Base = MI.getOperand(1 + X86::AddrBaseReg);
const MachineOperand &Scale = MI.getOperand(1 + X86::AddrScaleAmt);
const MachineOperand &Index = MI.getOperand(1 + X86::AddrIndexReg);
const MachineOperand &Offset = MI.getOperand(1 + X86::AddrDisp);
const MachineOperand &Segment = MI.getOperand(1 + X86::AddrSegmentReg);
- if (!(TII->isThreeOperandsLEA(MI) ||
- hasInefficientLEABaseReg(Base, Index)) ||
+ if (!(TII->isThreeOperandsLEA(MI) || hasInefficientLEABaseReg(Base, Index)) ||
!TII->isSafeToClobberEFLAGS(MBB, MI) ||
Segment.getReg() != X86::NoRegister)
- return nullptr;
+ return;
+
+ Register DestReg = Dest.getReg();
+ Register BaseReg = Base.getReg();
+ Register IndexReg = Index.getReg();
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ if (BaseReg != 0)
+ BaseReg = TRI->getSubReg(BaseReg, X86::sub_32bit);
+ if (IndexReg != 0)
+ IndexReg = TRI->getSubReg(IndexReg, X86::sub_32bit);
+ }
- unsigned DstR = Dst.getReg();
- unsigned BaseR = Base.getReg();
- unsigned IndexR = Index.getReg();
- unsigned SSDstR =
- (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
bool IsScale1 = Scale.getImm() == 1;
- bool IsInefficientBase = isInefficientLEAReg(BaseR);
- bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+ bool IsInefficientBase = isInefficientLEAReg(BaseReg);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexReg);
// Skip these cases since it takes more than 2 instructions
// to replace the LEA instruction.
- if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
- return nullptr;
- if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
- (IsInefficientIndex || !IsScale1))
- return nullptr;
-
- const DebugLoc DL = MI.getDebugLoc();
- const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
- const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+ if (IsInefficientBase && DestReg == BaseReg && !IsScale1)
+ return;
LLVM_DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
LLVM_DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+ MachineInstr *NewMI = nullptr;
+
// First try to replace LEA with one or two (for the 3-op LEA case)
// add instructions:
// 1.lea (%base,%index,1), %base => add %index,%base
// 2.lea (%base,%index,1), %index => add %base,%index
- if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
- const MachineOperand &Src = DstR == BaseR ? Index : Base;
- MachineInstr *NewMI =
- BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
- LLVM_DEBUG(NewMI->dump(););
- // Create ADD instruction for the Offset in case of 3-Ops LEA.
- if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- LLVM_DEBUG(NewMI->dump(););
+ if (IsScale1 && (DestReg == BaseReg || DestReg == IndexReg)) {
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ if (DestReg != BaseReg)
+ std::swap(BaseReg, IndexReg);
+
+ if (MI.getOpcode() == X86::LEA64_32r) {
+ // TODO: Do we need the super register implicit use?
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg)
+ .addReg(Base.getReg(), RegState::Implicit)
+ .addReg(Index.getReg(), RegState::Implicit);
+ } else {
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(BaseReg)
+ .addReg(IndexReg);
}
- return NewMI;
- }
- // If the base is inefficient try switching the index and base operands,
- // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
- // lea offset(%base,%index,scale),%dst =>
- // lea (%base,%index,scale); add offset,%dst
- if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
- MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .add(IsInefficientBase ? Index : Base)
- .add(Scale)
- .add(IsInefficientBase ? Base : Index)
- .addImm(0)
- .add(Segment);
+ } else if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale); add offset,%dst
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
LLVM_DEBUG(NewMI->dump(););
+ }
+
+ // If either replacement succeeded above, add the offset if needed, then
+ // replace the instruction.
+ if (NewMI) {
// Create ADD instruction for the Offset in case of 3-Ops LEA.
if (hasLEAOffset(Offset)) {
- NewMI = BuildMI(MBB, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
- LLVM_DEBUG(NewMI->dump(););
+ if (OptIncDec && Offset.isImm() &&
+ (Offset.getImm() == 1 || Offset.getImm() == -1)) {
+ unsigned NewOpc =
+ getINCDECFromLEA(MI.getOpcode(), Offset.getImm() == 1);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg);
+ LLVM_DEBUG(NewMI->dump(););
+ } else {
+ unsigned NewOpc = getADDriFromLEA(MI.getOpcode(), Offset);
+ NewMI = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Offset);
+ LLVM_DEBUG(NewMI->dump(););
+ }
}
- return NewMI;
+
+ MBB.erase(I);
+ I = NewMI;
+ return;
}
+
// Handle the rest of the cases with inefficient base register:
- assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(DestReg != BaseReg && "DestReg == BaseReg should be handled already!");
assert(IsInefficientBase && "efficient base should be handled already!");
+ // FIXME: Handle LEA64_32r.
+ if (LEAOpcode == X86::LEA64_32r)
+ return;
+
// lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
if (IsScale1 && !hasLEAOffset(Offset)) {
- bool BIK = Base.isKill() && BaseR != IndexR;
- TII->copyPhysReg(MBB, MI, DL, DstR, BaseR, BIK);
+ bool BIK = Base.isKill() && BaseReg != IndexReg;
+ TII->copyPhysReg(MBB, MI, MI.getDebugLoc(), DestReg, BaseReg, BIK);
LLVM_DEBUG(MI.getPrevNode()->dump(););
- MachineInstr *NewMI =
- BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Index);
LLVM_DEBUG(NewMI->dump(););
- return NewMI;
+ return;
}
+
// lea offset(%base,%index,scale), %dst =>
// lea offset( ,%index,scale), %dst; add %base,%dst
- MachineInstr *NewMI = BuildMI(MBB, MI, DL, TII->get(LEAOpcode))
- .add(Dst)
- .addReg(0)
- .add(Scale)
- .add(Index)
- .add(Offset)
- .add(Segment);
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(LEAOpcode))
+ .add(Dest)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
LLVM_DEBUG(NewMI->dump(););
- NewMI = BuildMI(MBB, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ unsigned NewOpc = getADDrrFromLEA(MI.getOpcode());
+ NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), DestReg)
+ .addReg(DestReg)
+ .add(Base);
LLVM_DEBUG(NewMI->dump(););
- return NewMI;
+
+ MBB.erase(I);
+ I = NewMI;
}
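When the rewritten LEA still carries a displacement, the code above now materializes it with INC/DEC for a +/-1 immediate (when OptIncDec is set) and with ADDri otherwise. That decision pulled out into a hypothetical helper; the condition is the one used in the hunk above, illustration only:

#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// Illustration only: prefer INC/DEC over ADDri for a +/-1 displacement.
static bool preferIncDecForOffset(bool OptIncDec, const MachineOperand &Offset) {
  return OptIncDec && Offset.isImm() &&
         (Offset.getImm() == 1 || Offset.getImm() == -1);
}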
diff --git a/lib/Target/X86/X86FixupSetCC.cpp b/lib/Target/X86/X86FixupSetCC.cpp
index e2d4d1ede6f3..cbde280aa280 100644
--- a/lib/Target/X86/X86FixupSetCC.cpp
+++ b/lib/Target/X86/X86FixupSetCC.cpp
@@ -136,8 +136,8 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *RC = MF.getSubtarget<X86Subtarget>().is64Bit()
? &X86::GR32RegClass
: &X86::GR32_ABCDRegClass;
- unsigned ZeroReg = MRI->createVirtualRegister(RC);
- unsigned InsertReg = MRI->createVirtualRegister(RC);
+ Register ZeroReg = MRI->createVirtualRegister(RC);
+ Register InsertReg = MRI->createVirtualRegister(RC);
// Initialize a register with 0. This must go before the eflags def
BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0),
diff --git a/lib/Target/X86/X86FlagsCopyLowering.cpp b/lib/Target/X86/X86FlagsCopyLowering.cpp
index 5ce3255ea96a..cfba06fb6533 100644
--- a/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -721,8 +721,9 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
for (MachineInstr &MI :
llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
X86::CondCode Cond = X86::getCondFromSETCC(MI);
- if (Cond != X86::COND_INVALID && !MI.mayStore() && MI.getOperand(0).isReg() &&
- TRI->isVirtualRegister(MI.getOperand(0).getReg())) {
+ if (Cond != X86::COND_INVALID && !MI.mayStore() &&
+ MI.getOperand(0).isReg() &&
+ Register::isVirtualRegister(MI.getOperand(0).getReg())) {
assert(MI.getOperand(0).isDef() &&
"A non-storing SETcc should always define a register!");
CondRegs[Cond] = MI.getOperand(0).getReg();
@@ -739,7 +740,7 @@ CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
unsigned X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, X86::CondCode Cond) {
- unsigned Reg = MRI->createVirtualRegister(PromoteRC);
+ Register Reg = MRI->createVirtualRegister(PromoteRC);
auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
TII->get(X86::SETCCr), Reg).addImm(Cond);
(void)SetI;
@@ -813,7 +814,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
MachineBasicBlock &MBB = *MI.getParent();
// Insert an instruction that will set the flag back to the desired value.
- unsigned TmpReg = MRI->createVirtualRegister(PromoteRC);
+ Register TmpReg = MRI->createVirtualRegister(PromoteRC);
auto AddI =
BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(), TII->get(X86::ADD8ri))
.addDef(TmpReg, RegState::Dead)
@@ -974,7 +975,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
// Now we need to turn this into a bitmask. We do this by subtracting it from
// zero.
- unsigned ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
ZeroReg = AdjustReg(ZeroReg);
@@ -999,7 +1000,7 @@ void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
default:
llvm_unreachable("Invalid SETB_C* opcode!");
}
- unsigned ResultReg = MRI->createVirtualRegister(&SetBRC);
+ Register ResultReg = MRI->createVirtualRegister(&SetBRC);
BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
.addReg(ZeroReg)
.addReg(ExtCondReg);
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 074cf21d03f5..fcfb5bc91314 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -288,8 +288,8 @@ namespace {
// Check if a COPY instruction is using FP registers.
static bool isFPCopy(MachineInstr &MI) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
return X86::RFP80RegClass.contains(DstReg) ||
X86::RFP80RegClass.contains(SrcReg);
@@ -313,7 +313,7 @@ FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
/// For example, this returns 3 for X86::FP3.
static unsigned getFPReg(const MachineOperand &MO) {
assert(MO.isReg() && "Expected an FP register!");
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
return Reg - X86::FP0;
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index e310fe069117..1b469a814adc 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -35,8 +35,8 @@
using namespace llvm;
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
- unsigned StackAlignOverride)
- : TargetFrameLowering(StackGrowsDown, StackAlignOverride,
+ MaybeAlign StackAlignOverride)
+ : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
STI.is64Bit() ? -8 : -4),
STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
// Cache a bunch of frame-related predicates for this subtarget.
@@ -176,7 +176,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
MachineOperand &MO = MBBI->getOperand(i);
if (!MO.isReg() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
@@ -216,7 +216,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg != X86::EFLAGS)
continue;
@@ -995,11 +995,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
bool NeedsDwarfCFI =
!IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry());
- unsigned FramePtr = TRI->getFrameRegister(MF);
- const unsigned MachineFramePtr =
+ Register FramePtr = TRI->getFrameRegister(MF);
+ const Register MachineFramePtr =
STI.isTarget64BitILP32()
- ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
- unsigned BasePtr = TRI->getBaseRegister();
+ ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
+ Register BasePtr = TRI->getBaseRegister();
bool HasWinCFI = false;
// Debug location must be unknown since the first debug location is used
@@ -1016,14 +1016,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
-
- // The default stack probe size is 4096 if the function has no stackprobesize
- // attribute.
- unsigned StackProbeSize = 4096;
- if (Fn.hasFnAttribute("stack-probe-size"))
- Fn.getFnAttribute("stack-probe-size")
- .getValueAsString()
- .getAsInteger(0, StackProbeSize);
+ unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
@@ -1081,7 +1074,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int stackGrowth = -SlotSize;
// Find the funclet establisher parameter
- unsigned Establisher = X86::NoRegister;
+ Register Establisher = X86::NoRegister;
if (IsClrFunclet)
Establisher = Uses64BitFramePtr ? X86::RCX : X86::ECX;
else if (IsFunclet)
@@ -1192,7 +1185,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
- unsigned Reg = MBBI->getOperand(0).getReg();
+ Register Reg = MBBI->getOperand(0).getReg();
++MBBI;
if (!HasFP && NeedsDwarfCFI) {
@@ -1396,9 +1389,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
int FI;
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
+ int Offset;
unsigned IgnoredFrameReg;
- int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
- Offset += SEHFrameOffset;
+ if (IsWin64Prologue && IsFunclet)
+ Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
+ else
+ Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
+ SEHFrameOffset;
HasWinCFI = true;
assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1554,9 +1551,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
unsigned
X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
// This is the size of the pushed CSRs.
- unsigned CSSize =
- MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ // This is the size of callee saved XMMs.
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ unsigned XMMSize = WinEHXMMSlotInfo.size() *
+ TRI->getSpillSize(X86::VR128RegClass);
// This is the amount of stack a funclet needs to allocate.
unsigned UsedSize;
EHPersonality Personality =
@@ -1576,7 +1577,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
- return FrameSizeMinusRBP - CSSize;
+ return FrameSizeMinusRBP + XMMSize - CSSize;
}
static bool isTailCallOpcode(unsigned Opc) {
@@ -1597,9 +1598,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
DL = MBBI->getDebugLoc();
// standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
const bool Is64BitILP32 = STI.isTarget64BitILP32();
- unsigned FramePtr = TRI->getFrameRegister(MF);
+ Register FramePtr = TRI->getFrameRegister(MF);
unsigned MachineFramePtr =
- Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
+ Is64BitILP32 ? Register(getX86SubSuperRegister(FramePtr, 64)) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsWin64CFI =
@@ -1850,6 +1851,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
+ int FI, unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+ const auto it = WinEHXMMSlotInfo.find(FI);
+
+ if (it == WinEHXMMSlotInfo.end())
+ return getFrameIndexReference(MF, FI, FrameReg);
+
+ FrameReg = TRI->getStackRegister();
+ return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+}
+
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
int Adjustment) const {
@@ -1948,6 +1963,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned CalleeSavedFrameSize = 0;
+ unsigned XMMCalleeSavedFrameSize = 0;
+ auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -1984,7 +2001,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from CSI list and not have to worry
// about avoiding it later.
- unsigned FPReg = TRI->getFrameRegister(MF);
+ Register FPReg = TRI->getFrameRegister(MF);
for (unsigned i = 0; i < CSI.size(); ++i) {
if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
CSI.erase(CSI.begin() + i);
@@ -2025,12 +2042,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
unsigned Size = TRI->getSpillSize(*RC);
unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
- SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
+ assert(SpillSlotOffset < 0 && "SpillSlotOffset should always be < 0 on X86");
+ SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
MFI.ensureMaxAlignment(Align);
+
+ // Save the start offset and size of XMM in stack frame for funclets.
+ if (X86::VR128RegClass.contains(Reg)) {
+ WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
+ XMMCalleeSavedFrameSize += Size;
+ }
}
return true;
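The spill-slot alignment above switches from a modulo adjustment to alignTo on the offset's magnitude, which rounds the negative offset down to an actually aligned address. A standalone worked example (not pass code), assuming only llvm::alignTo from MathExtras.h:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdlib>
using namespace llvm;

// Illustration only: for an offset of -20 and a 16-byte spill alignment, the
// old "offset -= abs(offset) % 16" form gave -24, which is not 16-byte
// aligned; -alignTo(20, 16) gives -32, the next aligned slot below.
int main() {
  int SpillSlotOffset = -20;
  int SlotAlign = 16;
  int Old = SpillSlotOffset - std::abs(SpillSlotOffset) % SlotAlign;
  int New = -int(alignTo(-SpillSlotOffset, SlotAlign));
  assert(Old == -24 && New == -32 && New % SlotAlign == 0);
  return 0;
}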
@@ -2200,7 +2225,7 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Spill the BasePtr if it's used.
if (TRI->hasBasePointer(MF)){
- unsigned BasePtr = TRI->getBaseRegister();
+ Register BasePtr = TRI->getBaseRegister();
if (STI.isTarget64BitILP32())
BasePtr = getX86SubSuperRegister(BasePtr, 64);
SavedRegs.set(BasePtr);
@@ -2212,7 +2237,7 @@ HasNestArgument(const MachineFunction *MF) {
const Function &F = MF->getFunction();
for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
I != E; I++) {
- if (I->hasNestAttr())
+ if (I->hasNestAttr() && !I->use_empty())
return true;
}
return false;
@@ -2244,7 +2269,8 @@ GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Pr
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
- CallingConvention == CallingConv::Fast) {
+ CallingConvention == CallingConv::Fast ||
+ CallingConvention == CallingConv::Tail) {
if (IsNested)
report_fatal_error("Segmented stacks does not support fastcall with "
"nested function.");
@@ -2525,6 +2551,18 @@ static unsigned getHiPELiteral(
+ " required but not provided");
}
+// Return true if there are no non-ehpad successors to MBB and there are no
+// non-meta instructions between MBBI and MBB.end().
+static bool blockEndIsUnreachable(const MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator MBBI) {
+ return std::all_of(
+ MBB.succ_begin(), MBB.succ_end(),
+ [](const MachineBasicBlock *Succ) { return Succ->isEHPad(); }) &&
+ std::all_of(MBBI, MBB.end(), [](const MachineInstr &MI) {
+ return MI.isMetaInstruction();
+ });
+}
+
/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of hybrid stack/heap architecture.
@@ -2758,7 +2796,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0;
+ uint64_t Amount = TII.getFrameSize(*I);
uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
@@ -2847,7 +2885,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return I;
}
- if (isDestroy && InternalAmt) {
+ if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back. We do this until we have
// more advanced stack pointer tracking ability.
@@ -2912,8 +2950,8 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
"restoring EBP/ESI on non-32-bit target");
MachineFunction &MF = *MBB.getParent();
- unsigned FramePtr = TRI->getFrameRegister(MF);
- unsigned BasePtr = TRI->getBaseRegister();
+ Register FramePtr = TRI->getFrameRegister(MF);
+ Register BasePtr = TRI->getBaseRegister();
WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index d32746e3a36e..2103d6471ead 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -25,7 +25,7 @@ class X86RegisterInfo;
class X86FrameLowering : public TargetFrameLowering {
public:
- X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride);
+ X86FrameLowering(const X86Subtarget &STI, MaybeAlign StackAlignOverride);
// Cached subtarget predicates.
@@ -99,6 +99,8 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+ int getWin64EHFrameIndexRef(const MachineFunction &MF,
+ int FI, unsigned &SPReg) const;
int getFrameIndexReferenceSP(const MachineFunction &MF,
int FI, unsigned &SPReg, int Adjustment) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
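The constructor now takes MaybeAlign, where an unset or zero override means "use the target default". A tiny sketch of the valueOrOne() call the .cpp change above relies on, assuming the MaybeAlign API from llvm/Support/Alignment.h; illustration only:

#include "llvm/Support/Alignment.h"
using namespace llvm;

// Illustration only: MaybeAlign() / MaybeAlign(0) carry no override, and
// valueOrOne() then degrades gracefully to a 1-byte alignment.
static Align resolveStackAlign(MaybeAlign Override) {
  return Override.valueOrOne();
}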
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 95d31e62cafc..5b546d42d98a 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -253,6 +253,11 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -362,6 +367,11 @@ namespace {
if (User->getNumOperands() != 2)
continue;
+ // If this can match to INC/DEC, don't count it as a use.
+ if (User->getOpcode() == ISD::ADD &&
+ (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
+ continue;
+
// Immediates that are used for offsets as part of stack
// manipulation should be left alone. These are typically
// used to indicate SP offsets for argument passing and
@@ -502,8 +512,10 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
+ bool combineIncDecVector(SDNode *Node);
bool tryShrinkShlLogicImm(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
+ bool tryMatchBitSelect(SDNode *N);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -746,7 +758,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
return false;
LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
if (!LD ||
- LD->isVolatile() ||
+ !LD->isSimple() ||
LD->getAddressingMode() != ISD::UNINDEXED ||
LD->getExtensionType() != ISD::NON_EXTLOAD)
return false;
@@ -873,10 +885,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(N);
- SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
- N->getValueType(0),
- N->getOperand(0),
- CurDAG->getConstant(Imm, dl, MVT::i8));
+ SDValue Res = CurDAG->getNode(
+ X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
@@ -2305,10 +2316,10 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
return false;
// We can allow a full vector load here since narrowing a load is ok unless
- // it's volatile.
+ // it's volatile or atomic.
if (ISD::isNON_EXTLoad(N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(N);
- if (!LD->isVolatile() &&
+ if (LD->isSimple() &&
IsProfitableToFold(N, LD, Root) &&
IsLegalToFold(N, Parent, Root, OptLevel)) {
PatternNodeWithChain = N;
@@ -2464,6 +2475,37 @@ bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
Complexity += 2;
}
+ // Heuristic: try harder to form an LEA from ADD if the operands set flags.
+ // Unlike ADD, LEA does not affect flags, so we will be less likely to require
+ // duplicating flag-producing instructions later in the pipeline.
+ if (N.getOpcode() == ISD::ADD) {
+ auto isMathWithFlags = [](SDValue V) {
+ switch (V.getOpcode()) {
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::ADC:
+ case X86ISD::SBB:
+ /* TODO: These opcodes can be added safely, but we may want to justify
+ their inclusion for different reasons (better for reg-alloc).
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ */
+ // Value 1 is the flag output of the node - verify it's not dead.
+ return !SDValue(V.getNode(), 1).use_empty();
+ default:
+ return false;
+ }
+ };
+ // TODO: This could be an 'or' rather than 'and' to make the transform more
+ // likely to happen. We might want to factor in whether there's a
+ // load folding opportunity for the math op that disappears with LEA.
+ if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
+ Complexity++;
+ }
+
if (AM.Disp)
Complexity++;
@@ -2544,6 +2586,7 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
if (!ISD::isNON_EXTLoad(N.getNode()) ||
!IsProfitableToFold(N, P, Root) ||
!IsLegalToFold(N, P, Root, OptLevel))
@@ -2553,6 +2596,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ assert(Root && P && "Unknown root/parent nodes");
+ if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -3302,8 +3359,12 @@ bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
SDValue ImplDef = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
- NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
- NBits);
+
+ SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
+ insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
+ NBits = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
+ NBits, SRIdxVal), 0);
insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
if (Subtarget->hasBMI2()) {
@@ -3400,8 +3461,9 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
// TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
// hoisting the move immediate would make it worthwhile with a less optimal
// BEXTR?
- if (!Subtarget->hasTBM() &&
- !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
+ bool PreferBEXTR =
+ Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
+ if (!PreferBEXTR && !Subtarget->hasBMI2())
return nullptr;
// Must have a shift right.
@@ -3440,23 +3502,50 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
if (Shift + MaskSize > NVT.getSizeInBits())
return nullptr;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
+ // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
+ // does not fit into 32 bits. Load folding is not a sufficient reason.
+ if (!PreferBEXTR && MaskSize <= 32)
+ return nullptr;
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ SDValue Control;
+ unsigned ROpc, MOpc;
+
+ if (!PreferBEXTR) {
+ assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
+ // If we can't make use of BEXTR then we can't fuse shift+mask stages.
+ // Let's perform the mask first, and apply shift later. Note that we need to
+ // widen the mask to account for the fact that we'll apply shift afterwards!
+ Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
+ ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
+ MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
- New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ } else {
+ // The 'control' of BEXTR has the pattern of:
+ // [15...8 bit][ 7...0 bit] location
+ // [ bit count][ shift] name
+ // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
+ Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+ if (Subtarget->hasTBM()) {
+ ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+ MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ } else {
+ assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
+ // BMI requires the immediate to be placed in a register.
+ ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+ MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+ Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
+ }
}
MachineSDNode *NewNode;
SDValue Input = N0->getOperand(0);
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+ SDValue Ops[] = {
+ Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
// Update the chain.
@@ -3464,7 +3553,15 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
// Record the mem-refs
CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
} else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
+ }
+
+ if (!PreferBEXTR) {
+ // We still need to apply the shift.
+ SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
+ unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
+ NewNode =
+ CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
}
return NewNode;
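The Control value built above follows the BEXTR encoding: bits 7:0 hold the starting bit (the shift) and bits 15:8 the number of bits to extract, while the BZHI fallback instead encodes Shift + MaskSize as a bound and re-applies the shift with a separate SHR. A standalone worked example of the BEXTR form; illustration only:

#include <cassert>

// Illustration only: extract 16 bits starting at bit 8, i.e. (x >> 8) & 0xFFFF.
int main() {
  unsigned Shift = 8, MaskSize = 16;
  unsigned Control = Shift | (MaskSize << 8);
  assert(Control == 0x1008); // [bit count = 0x10][shift = 0x08]
  unsigned X = 0xDEADBEEF;
  unsigned Extracted = (X >> Shift) & ((1u << MaskSize) - 1u);
  assert(Extracted == 0xADBE); // what BEXTR with this control produces
  return 0;
}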
@@ -3735,6 +3832,52 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
+ assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Should only be called for vectors.");
+
+ SDValue X = Node->getOperand(0);
+ SDValue OneVec = Node->getOperand(1);
+
+ APInt SplatVal;
+ if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
+ return false;
+
+ SDLoc DL(Node);
+ SDValue OneConstant, AllOnesVec;
+
+ APInt Ones = APInt::getAllOnesValue(32);
+ assert(VT.getSizeInBits() % 32 == 0 &&
+ "Expected bit count to be a multiple of 32");
+ OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
+ insertDAGNode(*CurDAG, X, OneConstant);
+
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ assert(NumElts > 0 && "Expected to get non-empty vector.");
+ AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
+ DL, OneConstant);
+ insertDAGNode(*CurDAG, X, AllOnesVec);
+
+ AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
+ insertDAGNode(*CurDAG, X, AllOnesVec);
+
+ unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
+
+ ReplaceNode(Node, NewNode.getNode());
+ SelectCode(NewNode.getNode());
+ return true;
+}
+
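The transform above relies on a two's-complement identity: adding a splat of 1 per lane equals subtracting a splat of -1, and the all-ones vector is essentially free to materialize (pcmpeq of a register with itself). A scalar sanity check of that identity; illustration only:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Illustration only: per 32-bit lane, X + 1 == X - 0xFFFFFFFF (mod 2^32).
int main() {
  for (uint32_t Lane : {0u, 1u, 42u, 0x7FFFFFFFu, 0xFFFFFFFFu})
    assert(uint32_t(Lane + 1u) == uint32_t(Lane - 0xFFFFFFFFu));
  return 0;
}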
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@@ -3975,12 +4118,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return false;
- // See if we're comparing against zero. This should have been canonicalized
- // to RHS during lowering.
- if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+ SDValue SetccOp0 = Setcc.getOperand(0);
+ SDValue SetccOp1 = Setcc.getOperand(1);
+
+ // Canonicalize the all zero vector to the RHS.
+ if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
+ std::swap(SetccOp0, SetccOp1);
+
+ // See if we're comparing against zero.
+ if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
return false;
- SDValue N0 = Setcc.getOperand(0);
+ SDValue N0 = SetccOp0;
MVT CmpVT = N0.getSimpleValueType();
MVT CmpSVT = CmpVT.getVectorElementType();
@@ -4027,13 +4176,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
// Look through single use bitcasts.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
- Src = Src.getOperand(0);
-
- if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
Parent = Src.getNode();
Src = Src.getOperand(0);
- if (Src.getSimpleValueType() == CmpSVT)
+ }
+
+ if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
+ if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
return Src;
}
@@ -4045,17 +4195,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
bool FoldedBCast = false;
if (!FoldedLoad && CanFoldLoads &&
(CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
- SDNode *ParentNode = nullptr;
+ SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
+ FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
}
// Try the other operand.
if (!FoldedBCast) {
+ SDNode *ParentNode = N0.getNode();
if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
- FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
- Tmp1, Tmp2, Tmp3, Tmp4);
+ FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedBCast)
std::swap(Src0, Src1);
}
@@ -4125,7 +4276,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
// Update the chain.
ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
// Record the mem-refs
- CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+ CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
} else {
if (IsMasked)
CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
@@ -4146,6 +4297,55 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
return true;
}
+// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
+// into vpternlog.
+bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
+
+ MVT NVT = N->getSimpleValueType(0);
+
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512())
+ return false;
+
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
+ std::swap(N0, N1);
+
+ if (N0.getOpcode() != ISD::AND ||
+ N1.getOpcode() != X86ISD::ANDNP ||
+ !N0.hasOneUse() || !N1.hasOneUse())
+ return false;
+
+ // ANDN is not commutable, so use it to pin down A and C.
+ SDValue A = N1.getOperand(0);
+ SDValue C = N1.getOperand(1);
+
+ // AND is commutable: if one operand matches A, the other operand is B.
+ // Otherwise this isn't a match.
+ SDValue B;
+ if (N0.getOperand(0) == A)
+ B = N0.getOperand(1);
+ else if (N0.getOperand(1) == A)
+ B = N0.getOperand(0);
+ else
+ return false;
+
+ SDLoc dl(N);
+ SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
+ SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
+ ReplaceNode(N, Ternlog.getNode());
+ SelectCode(Ternlog.getNode());
+ return true;
+}
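
The 0xCA immediate is the ternary truth table for a bit select; a quick standalone check of that encoding (a sketch, not part of the patch):

// VPTERNLOG's immediate bit at index (a<<2 | b<<1 | c) gives the result for
// operand bits (A,B,C), so (A & B) | (~A & C) encodes as 0xCA.
#include <cassert>

int main() {
  unsigned Imm = 0;
  for (unsigned Idx = 0; Idx != 8; ++Idx) {
    unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
    unsigned Bit = (A & B) | ((A ^ 1u) & C);  // bit select: A ? B : C
    Imm |= Bit << Idx;
  }
  assert(Imm == 0xCA);
  return 0;
}
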
+
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -4170,6 +4370,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned Opc = 0;
switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
case Intrinsic::x86_sse3_monitor:
if (!Subtarget->hasSSE3())
break;
@@ -4303,9 +4504,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (tryShrinkShlLogicImm(Node))
return;
+ if (Opcode == ISD::OR && tryMatchBitSelect(Node))
+ return;
+
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
+ if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
+ combineIncDecVector(Node))
+ return;
+
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
@@ -4333,6 +4541,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (!isInt<8>(Val) && !isInt<32>(Val))
break;
+ // If this can match to INC/DEC, let it go.
+ if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
+ break;
+
// Check if we should avoid folding this immediate.
if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
break;
@@ -4610,7 +4822,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
- SExtOpcode = X86::CBW;
+ SExtOpcode = 0; // Not used.
break;
case MVT::i16:
LoReg = X86::AX; HiReg = X86::DX;
@@ -4632,24 +4844,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
bool signBitIsZero = CurDAG->SignBitIsZero(N0);
SDValue InFlag;
- if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ if (NVT == MVT::i8) {
      // Special case for div8: widen the dividend into AX, zero-extending for
      // udiv (or sdiv with a known-zero sign bit) and sign-extending otherwise.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
MachineSDNode *Move;
if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
- Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
- MVT::Other, Ops);
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
+ : X86::MOVZX16rm8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
Chain = SDValue(Move, 1);
ReplaceUses(N0.getValue(1), Chain);
// Record the mem-refs
CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
} else {
- Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
+ unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
+ : X86::MOVZX16rr8;
+ Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
Chain = CurDAG->getEntryNode();
}
- Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
SDValue());
InFlag = Chain.getValue(1);
} else {
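
As a reminder of why the dividend is widened into AX rather than EAX here: the 8-bit divide instructions take their 16-bit dividend from AX, with AL receiving the quotient and AH the remainder. A scalar model of the extension requirement (a sketch, not part of the patch):

// DIV/IDIV r/m8 divide AX by the operand, so the i8 dividend must be
// zero-extended (udiv) or sign-extended (sdiv) into all 16 bits of AX first.
#include <cassert>
#include <cstdint>

int main() {
  int8_t Dividend = -7;
  int8_t Divisor = 2;
  int16_t AX = Dividend;       // MOVSX16rr8: sign-extend into AX
  assert(AX / Divisor == -3);  // AL would hold the quotient
  assert(AX % Divisor == -1);  // AH would hold the remainder
  return 0;
}
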
@@ -4996,10 +5211,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(Node);
- SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
- Node->getValueType(0),
+ SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
Node->getOperand(0),
- CurDAG->getConstant(Imm, dl, MVT::i8));
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8));
ReplaceNode(Node, Res.getNode());
SelectCode(Res.getNode());
return;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0b4bf687e6cf..ed975e9248a8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -65,17 +65,19 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
-static cl::opt<bool> ExperimentalVectorWideningLegalization(
- "x86-experimental-vector-widening-legalization", cl::init(false),
- cl::desc("Enable an experimental vector type legalization through widening "
- "rather than promotion."),
- cl::Hidden);
-
static cl::opt<int> ExperimentalPrefLoopAlignment(
"x86-experimental-pref-loop-alignment", cl::init(4),
- cl::desc("Sets the preferable loop alignment for experiments "
- "(the last x86-experimental-pref-loop-alignment bits"
- " of the loop header PC will be 0)."),
+ cl::desc(
+ "Sets the preferable loop alignment for experiments (as log2 bytes)"
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
+// Added in 10.0.
+static cl::opt<bool> EnableOldKNLABI(
+ "x86-enable-old-knl-abi", cl::init(false),
+ cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
+ "one ZMM register on AVX512F, but not AVX512BW targets."),
cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
@@ -84,6 +86,13 @@ static cl::opt<bool> MulConstantOptimization(
"SHIFT, LEA, etc."),
cl::Hidden);
+static cl::opt<bool> ExperimentalUnorderedISEL(
+ "x86-experimental-unordered-atomic-isel", cl::init(false),
+ cl::desc("Use LoadSDNode and StoreSDNode instead of "
+ "AtomicSDNode for unordered atomic loads and "
+ "stores respectively."),
+ cl::Hidden);
+
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
@@ -196,7 +205,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.hasCMov()) {
setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
}
setOperationAction(ISD::ABS , MVT::i64 , Custom);
@@ -214,14 +223,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
- if (Subtarget.is64Bit()) {
- if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
- // f32/f64 are legal, f80 is custom.
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- else
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
- } else if (!Subtarget.useSoftFloat()) {
+ if (!Subtarget.useSoftFloat()) {
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
@@ -277,29 +279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
- if (Subtarget.is64Bit()) {
- if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
- // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
- }
- } else if (!Subtarget.useSoftFloat()) {
- // Since AVX is a superset of SSE3, only check for SSE here.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
- // Expand FP_TO_UINT into a select.
- // FIXME: We would like to use a Custom expander here eventually to do
- // the optimal thing for SSE vs. the default expansion in the legalizer.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
- else
- // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
- // With SSE3 we can use fisttpll to convert to a signed i64; without
- // SSE, we're stuck with a fistpll.
- setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
-
- setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
+ if (!Subtarget.useSoftFloat()) {
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
}
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
@@ -345,11 +327,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
- setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f32 , Expand);
setOperationAction(ISD::FREM , MVT::f64 , Expand);
setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FREM , MVT::f128 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
// Promote the i8 variants and force them on up to i32 which has a shorter
@@ -396,15 +378,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// There's never any support for operations beyond MVT::f32.
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
@@ -638,17 +624,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
- // Long double always uses X87, except f128 in MMX.
+ // f80 always uses X87.
if (UseX87) {
- if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
- addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
- : &X86::VR128RegClass);
- ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
- setOperationAction(ISD::FABS , MVT::f128, Custom);
- setOperationAction(ISD::FNEG , MVT::f128, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
- }
-
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -684,10 +661,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
}
+ // f128 uses xmm registers, but most operations require libcalls.
+ if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
+ addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
+
+ addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
+
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ // We need to custom handle any FP_ROUND with an f128 input, but
+ // LegalizeDAG uses the result type to know when to run a custom handler.
+ // So we have to list all legal floating point result types here.
+ if (isTypeLegal(MVT::f32)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ }
+ if (isTypeLegal(MVT::f64)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
+ }
+ if (isTypeLegal(MVT::f80)) {
+ setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
+ }
+
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+ }
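
From the user's side, this configuration lets IEEE-quad values live in XMM registers while arithmetic still goes through runtime library calls. A minimal sketch of what such code looks like, assuming the __float128 extension and the usual soft-float runtime helpers (e.g. __addtf3) are available on the target:

// Sketch only: the FADD below is Custom-lowered to a libcall, while the value
// itself is carried in a 128-bit vector register.
__float128 AddQuad(__float128 A, __float128 B) { return A + B; }

int main() {
  __float128 A = 1, B = 2;
  return (AddQuad(A, B) == 3) ? 0 : 1;
}
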
+
// Always use a library call for pow.
setOperationAction(ISD::FPOW , MVT::f32 , Expand);
setOperationAction(ISD::FPOW , MVT::f64 , Expand);
setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f128 , Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -716,7 +743,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
@@ -754,7 +781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- for (MVT InnerVT : MVT::vector_valuetypes()) {
+ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(InnerVT, VT, Expand);
setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
@@ -797,6 +824,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -823,10 +852,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::MUL, MVT::v2i8, Custom);
- setOperationAction(ISD::MUL, MVT::v2i16, Custom);
- setOperationAction(ISD::MUL, MVT::v2i32, Custom);
setOperationAction(ISD::MUL, MVT::v4i8, Custom);
- setOperationAction(ISD::MUL, MVT::v4i16, Custom);
setOperationAction(ISD::MUL, MVT::v8i8, Custom);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
@@ -863,28 +889,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
- if (!ExperimentalVectorWideningLegalization) {
- // Use widening instead of promotion.
- for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8,
- MVT::v4i16, MVT::v2i16 }) {
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
- }
- }
-
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- // Provide custom widening for v2f32 setcc. This is really for VLX when
- // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
- // type legalization changing the result type to v4i1 during widening.
- // It works fine for SSE2 and is probably faster so no need to qualify with
- // VLX support.
- setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
-
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
@@ -904,19 +912,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- // We support custom legalizing of sext and anyext loads for specific
- // memory vector types which we can load as a scalar (or sequence of
- // scalars) and extend in-register to a legal 128-bit vector type. For sext
- // loads these must work with a single scalar load.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
- }
-
for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
@@ -938,7 +933,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
@@ -991,18 +985,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
- if (ExperimentalVectorWideningLegalization) {
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
- } else {
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
- }
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
@@ -1069,22 +1059,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
- if (!ExperimentalVectorWideningLegalization) {
- // Avoid narrow result types when widening. The legal types are listed
- // in the next loop.
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
- }
- }
-
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
- if (!ExperimentalVectorWideningLegalization)
- setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
@@ -1145,6 +1123,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1292,10 +1272,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STORE, VT, Custom);
}
- if (HasInt256)
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
if (HasInt256) {
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
@@ -1407,6 +1386,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
@@ -1433,12 +1414,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- if (ExperimentalVectorWideningLegalization) {
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
- }
+ // Need to custom widen this if we don't have AVX512BW.
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1529,10 +1508,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- // Need to custom split v32i16/v64i8 bitcasts.
if (!Subtarget.hasBWI()) {
+ // Need to custom split v32i16/v64i8 bitcasts.
setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
+
+ // Better to split these into two 256-bit ops.
+ setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
}
if (Subtarget.hasVBMI2()) {
@@ -1777,6 +1760,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHR, VT, Custom);
}
}
+
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
}
// We want to custom lower some of our intrinsics.
@@ -1905,13 +1892,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxLoadsPerMemcmpOptSize = 2;
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
- setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
+ setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
- setPrefFunctionAlignment(4); // 2^4 bytes.
+ setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
}
@@ -1939,8 +1926,7 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return TypeSplitVector;
- if (ExperimentalVectorWideningLegalization &&
- VT.getVectorNumElements() != 1 &&
+ if (VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -1950,19 +1936,62 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
+ // v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
+ return MVT::i8;
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
+ // v32i1 vectors should be promoted to v32i8 to match avx2.
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
+ return VT.getVectorNumElements();
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
+unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ Subtarget.hasAVX512() &&
+ (!isPowerOf2_32(VT.getVectorNumElements()) ||
+ (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
+ (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ RegisterVT = MVT::i8;
+ IntermediateVT = MVT::i1;
+ NumIntermediates = VT.getVectorNumElements();
+ return NumIntermediates;
+ }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
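
The three calling-convention hooks above share one predicate for when a vXi1 argument is broken into scalar i8 registers; a small standalone mirror of that condition (a simplified sketch, not the LLVM API):

#include <cassert>

// Simplified stand-in for the checks above: AVX512 is assumed present, and the
// element count decides whether the mask still fits a k-register for this CC.
static bool isPow2(unsigned N) { return N && !(N & (N - 1)); }
static bool breaksMaskToScalars(unsigned NumElts, bool HasBWI) {
  return !isPow2(NumElts) || (NumElts > 16 && !HasBWI) ||
         (NumElts > 64 && HasBWI);
}

int main() {
  assert(breaksMaskToScalars(5, /*HasBWI=*/false));    // v5i1  -> 5 x i8
  assert(!breaksMaskToScalars(16, /*HasBWI=*/false));  // v16i1 -> one mask reg
  assert(!breaksMaskToScalars(64, /*HasBWI=*/true));   // v64i1 -> one mask reg
  return 0;
}
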
+
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
@@ -2060,6 +2089,11 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16)))) {
+ // FIXME: Check if unaligned 64-byte accesses are slow.
+ if (Size >= 64 && Subtarget.hasAVX512() &&
+ (Subtarget.getPreferVectorWidth() >= 512)) {
+ return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
+ }
// FIXME: Check if unaligned 32-byte accesses are slow.
if (Size >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
@@ -2403,8 +2437,8 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
- const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
- SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
+ const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
@@ -2537,7 +2571,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
- Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
Subtarget);
assert(2 == RegsToPass.size() &&
@@ -2816,6 +2850,10 @@ SDValue X86TargetLowering::LowerCallResult(
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (CopyVT == MVT::f64 &&
+ (Is64Bit && !Subtarget.hasSSE2())) {
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2925,7 +2963,7 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
- CC == CallingConv::HHVM);
+ CC == CallingConv::HHVM || CC == CallingConv::Tail);
}
/// Return true if we might ever do TCO for calls with this calling convention.
@@ -2951,7 +2989,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
+ return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
}
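
The net effect is that the new Tail calling convention opts into the guaranteed-TCO ABI on its own, independent of -tailcallopt. A simplified mirror of the rule (a sketch with stand-in enums, not the real CallingConv namespace):

#include <cassert>

enum class CC { C, Fast, GHC, HiPE, X86_RegCall, HHVM, Tail };

// Assumed simplification of canGuaranteeTCO/shouldGuaranteeTCO above.
static bool canGuaranteeTCO(CC C) {
  return C == CC::Fast || C == CC::GHC || C == CC::X86_RegCall ||
         C == CC::HiPE || C == CC::HHVM || C == CC::Tail;
}
static bool shouldGuaranteeTCO(CC C, bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && canGuaranteeTCO(C)) || C == CC::Tail;
}

int main() {
  assert(shouldGuaranteeTCO(CC::Tail, /*GuaranteedTailCallOpt=*/false));
  assert(!shouldGuaranteeTCO(CC::Fast, /*GuaranteedTailCallOpt=*/false));
  return 0;
}
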
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
@@ -3405,7 +3443,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Find the largest legal vector type.
MVT VecVT = MVT::Other;
// FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget.hasAVX512() &&
+ if (Subtarget.useAVX512Regs() &&
(Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
CallConv == CallingConv::Intel_OCL_BI)))
VecVT = MVT::v16f32;
@@ -3577,6 +3615,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
+ bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
+ CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
@@ -3597,8 +3637,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Attr.getValueAsString() == "true")
isTailCall = false;
- if (Subtarget.isPICStyleGOT() &&
- !MF.getTarget().Options.GuaranteedTailCallOpt) {
+ if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
@@ -3625,7 +3664,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
- if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+ if (!IsGuaranteeTCO && isTailCall)
IsSibcall = true;
if (isTailCall)
@@ -3657,8 +3696,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
- else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- canGuaranteeTCO(CallConv))
+ else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
@@ -3782,8 +3820,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
// Split v64i1 value into two registers
- Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
- Subtarget);
+ Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
@@ -4069,6 +4106,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+ // Save heapallocsite metadata.
+ if (CLI.CS)
+ if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
+
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPop;
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
@@ -4190,7 +4232,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(VR))
+ if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
if (!Def)
@@ -4279,6 +4321,8 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
+ bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
+ CalleeCC == CallingConv::Tail;
// Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
@@ -4286,7 +4330,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
if (IsCalleeWin64 != IsCallerWin64)
return false;
- if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+ if (IsGuaranteeTCO) {
if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
@@ -4413,7 +4457,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
@@ -4652,7 +4696,11 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
+ // X >= 0 -> X == 0, jump on !sign.
+ return X86::COND_NS;
+ }
+ if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
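
Both rewrites are plain signed-integer identities; a scalar spot-check (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {-2, -1, 0, 1, 2}) {
    bool SignClear = (static_cast<uint32_t>(X) >> 31) == 0;
    assert((X >= 0) == SignClear);  // SETGE vs 0 -> jump on !sign (COND_NS)
    assert((X < 1) == (X <= 0));    // SETLT vs 1 -> SETLE vs 0 (COND_LE)
  }
  return 0;
}
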
@@ -4760,7 +4808,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4773,7 +4821,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOLoad;
break;
}
@@ -4785,7 +4833,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = 1;
+ Info.align = Align::None();
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4811,6 +4859,8 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
ISD::LoadExtType ExtTy,
EVT NewVT) const {
+ assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
+
// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
// relocation target a movq or addq instruction: don't let the load shrink.
SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -4852,11 +4902,12 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
+bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
// If we are using XMM registers in the ABI and the condition of the select is
// a floating-point compare and we have blendv or conditional move, then it is
// cheaper to select instead of doing a cross-register move and creating a
// load that depends on the compare result.
+ bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}
@@ -4869,15 +4920,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
return true;
}
-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
+ // Find the type this will be legalized to. Otherwise we might prematurely
+ // convert this to shl+add/sub and then still have to type legalize those ops.
+ // Another choice would be to defer the decision for illegal types until
+ // after type legalization. But constant splat vectors of i64 can't make it
+ // through type legalization on 32-bit targets so we would need to special
+ // case vXi64.
+ while (getTypeAction(Context, VT) != TypeLegal)
+ VT = getTypeToTransformTo(Context, VT);
+
// If vector multiply is legal, assume that's faster than shl + add/sub.
- // TODO: Multiply is a complex op with higher latency and lower througput in
+ // TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
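
The fold being gated here rewrites a multiply by a splat constant as a shift plus an add or sub; the scalar shape of that decomposition (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 7u, 123456u}) {
    assert(X * 9u == (X << 3) + X);  // mul by 2^N + 1  ->  shl + add
    assert(X * 7u == (X << 3) - X);  // mul by 2^N - 1  ->  shl + sub
  }
  return 0;
}
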
@@ -5022,6 +5083,33 @@ bool X86TargetLowering::hasAndNot(SDValue Y) const {
return Subtarget.hasSSE2();
}
+bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
+ return X.getValueType().isScalarInteger(); // 'bt'
+}
+
+bool X86TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does baseline recommend not to perform the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // For scalars this transform is always beneficial.
+ if (X.getValueType().isScalarInteger())
+ return true;
+ // If all the shift amounts are identical, then transform is beneficial even
+ // with rudimentary SSE2 shifts.
+ if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
+ return true;
+ // If we have AVX2 with its powerful shift operations, then it's also good.
+ if (Subtarget.hasAVX2())
+ return true;
+ // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
+ return NewShiftOpcode == ISD::SHL;
+}
+
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
@@ -5054,6 +5142,14 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
return true;
}
+bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
+ SDNode *N) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget.isOSWindows())
+ return false;
+ return true;
+}
+
bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
// Any legal vector type can be splatted more efficiently than
// loading/spilling from memory.
@@ -5093,10 +5189,8 @@ static bool isUndefOrZero(int Val) {
/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
- if (Mask[i] != SM_SentinelUndef)
- return false;
- return true;
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return M == SM_SentinelUndef; });
}
/// Return true if the mask creates a vector whose lower half is undefined.
@@ -5119,10 +5213,7 @@ static bool isInRange(int Val, int Low, int Hi) {
/// Return true if the value of any element in Mask falls within the specified
/// range (L, H].
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
- for (int M : Mask)
- if (isInRange(M, Low, Hi))
- return true;
- return false;
+ return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
/// Return true if Val is undef or if its value falls within the
@@ -5133,12 +5224,9 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range (L, H].
-static bool isUndefOrInRange(ArrayRef<int> Mask,
- int Low, int Hi) {
- for (int M : Mask)
- if (!isUndefOrInRange(M, Low, Hi))
- return false;
- return true;
+static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
}
/// Return true if Val is undef, zero or if its value falls within the
@@ -5150,10 +5238,8 @@ static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range (L, H].
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
- for (int M : Mask)
- if (!isUndefOrZeroOrInRange(M, Low, Hi))
- return false;
- return true;
+ return llvm::all_of(
+ Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
}
/// Return true if every element in Mask, beginning
@@ -5171,8 +5257,9 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range (Low, Low+Size], or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
- unsigned Size, int Low) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
+ unsigned Size, int Low,
+ int Step = 1) {
+ for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
@@ -5182,10 +5269,8 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
- for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
- if (!isUndefOrZero(Mask[i]))
- return false;
- return true;
+ return llvm::all_of(Mask.slice(Pos, Size),
+ [](int M) { return isUndefOrZero(M); });
}
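
The range-helper cleanups in this stretch all follow the same pattern, replacing hand-written loops with the llvm::all_of/any_of wrappers. An equivalent shape using the standard algorithms (a sketch, with -1 assumed as the undef sentinel):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // std::all_of/any_of standing in for the LLVM range wrappers.
  std::vector<int> Mask = {-1, -1, -1};
  assert(std::all_of(Mask.begin(), Mask.end(),
                     [](int M) { return M == -1; }));
  assert(!std::any_of(Mask.begin(), Mask.end(),
                      [](int M) { return M >= 0; }));
  return 0;
}
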
/// Helper function to test whether a shuffle mask could be
@@ -5357,6 +5442,8 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
SDValue Vec;
if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
+ } else if (VT.isFloatingPoint()) {
+ Vec = DAG.getConstantFP(+0.0, dl, VT);
} else if (VT.getVectorElementType() == MVT::i1) {
assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
"Unexpected vector type");
@@ -5500,6 +5587,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
Idx == (VT.getVectorNumElements() / 2) &&
Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
isNullConstant(Src.getOperand(2))) {
Ops.push_back(Src.getOperand(1));
Ops.push_back(Sub);
@@ -5593,7 +5681,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
@@ -5609,14 +5697,14 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0) {
// Zero lower bits of the Vec
- SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
@@ -5628,7 +5716,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
@@ -5638,30 +5726,30 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
- DAG.getConstant(ShiftRight, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ DAG.getConstant(0, dl, WideOpVT),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
- SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
+ SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
@@ -5675,30 +5763,47 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- // Move the current value of the bit to be replace to the lsbs.
- Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- // Xor with the new bit.
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
- // Shift to MSB, filling bottom bits with 0.
+
+ // Clear the upper bits of the subvector and move it to its insert position.
unsigned ShiftLeft = NumElems - SubVecNumElems;
- Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- // Shift to the final position, filling upper bits with 0.
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
- DAG.getConstant(ShiftRight, dl, MVT::i8));
- // Xor with original vector leaving the new value.
- Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+
+ // Isolate the bits below the insertion point.
+ unsigned LowShift = NumElems - IdxVal;
+ SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+ Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+ DAG.getTargetConstant(LowShift, dl, MVT::i8));
+
+ // Isolate the bits after the last inserted bit.
+ unsigned HighShift = IdxVal + SubVecNumElems;
+ SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+ High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
+
+ // Now OR all 3 pieces together.
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
+ SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
+
// Reduce to original width if needed.
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
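
The new lowering assembles the result from three pieces (low bits, high bits, shifted subvector) instead of the old XOR sequence. Modelling the mask registers as plain integers shows the same shift/OR dance (a sketch, not the DAG code):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumElems = 16, SubNumElems = 4, IdxVal = 6;
  uint32_t Vec = 0xFFFF, Sub = 0x5;  // v16i1/v4i1 modeled as bitmasks

  // Clear the subvector's upper bits and move it to the insert position.
  uint32_t SubShifted = ((Sub << (NumElems - SubNumElems)) & 0xFFFF) >>
                        (NumElems - SubNumElems - IdxVal);
  // Isolate the bits below and above the inserted range.
  uint32_t Low = ((Vec << (NumElems - IdxVal)) & 0xFFFF) >> (NumElems - IdxVal);
  uint32_t High =
      ((Vec >> (IdxVal + SubNumElems)) << (IdxVal + SubNumElems)) & 0xFFFF;
  // OR the three pieces together, as the KSHIFTL/KSHIFTR/OR sequence does.
  uint32_t Res = Low | High | SubShifted;

  assert(Res == ((Vec & ~(0xFu << IdxVal)) | (Sub << IdxVal)));
  return 0;
}
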
-static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
- unsigned NumElems, SelectionDAG &DAG,
- const SDLoc &dl, unsigned VectorWidth) {
- SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
- return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
+static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
+ EVT SubVT = V1.getValueType();
+ EVT SubSVT = SubVT.getScalarType();
+ unsigned SubNumElts = SubVT.getVectorNumElements();
+ unsigned SubVectorWidth = SubVT.getSizeInBits();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
+ SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
+ return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}
/// Returns a vector of specified type with all bits set.
@@ -5755,6 +5860,34 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
return DAG.getNode(Opcode, DL, VT, In);
}
+// Match (xor X, -1) -> X.
+// Match extract_subvector(xor X, -1) -> extract_subvector(X).
+// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
+ V = peekThroughBitcasts(V);
+ if (V.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+ return V.getOperand(0);
+ if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
+ if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
+ Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
+ Not, V.getOperand(1));
+ }
+ }
+ SmallVector<SDValue, 2> CatOps;
+ if (collectConcatOps(V.getNode(), CatOps)) {
+ for (SDValue &CatOp : CatOps) {
+ SDValue NotCat = IsNOT(CatOp, DAG);
+ if (!NotCat) return SDValue();
+ CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
+ }
+ return SDValue();
+}
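
At its core this is just the "xor with all-ones is NOT" identity, tracked through bitcasts, subvector extracts and concats; the scalar form (a sketch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu})
    assert((X ^ 0xFFFFFFFFu) == ~X);  // xor X, -1  is  NOT X
  return 0;
}
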
+
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -6003,6 +6136,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
}
+ if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
+ return false;
+
+ SDValue Ptr = MemIntr->getBasePtr();
+ if (Ptr->getOpcode() == X86ISD::Wrapper ||
+ Ptr->getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr->getOperand(0);
+
+ auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+ if (!CNode || CNode->isMachineConstantPoolEntry() ||
+ CNode->getOffset() != 0)
+ return false;
+
+ if (const Constant *C = CNode->getConstVal()) {
+ unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+ }
+ }
+
// Extract constant bits from a subvector broadcast.
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
SmallVector<APInt, 16> SubEltBits;
@@ -6123,7 +6287,9 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
}
-static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+namespace llvm {
+namespace X86 {
+bool isConstantSplat(SDValue Op, APInt &SplatVal) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
@@ -6146,6 +6312,8 @@ static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
return false;
}
+} // namespace X86
+} // namespace llvm
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
@@ -6551,13 +6719,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return true;
}
-/// Check a target shuffle mask's inputs to see if we can set any values to
-/// SM_SentinelZero - this is for elements that are known to be zero
-/// (not just zeroable) from their inputs.
+/// Decode a target shuffle mask and inputs and see if any values are
+/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
-static bool setTargetShuffleZeroElements(SDValue N,
- SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SDValue> &Ops) {
+static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops,
+ APInt &KnownUndef, APInt &KnownZero) {
bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
@@ -6566,15 +6733,17 @@ static bool setTargetShuffleZeroElements(SDValue N,
if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
+ int Size = Mask.size();
SDValue V1 = Ops[0];
SDValue V2 = IsUnary ? V1 : Ops[1];
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
assert((VT.getSizeInBits() % Mask.size()) == 0 &&
"Illegal split of shuffle value type");
- unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+ unsigned EltSizeInBits = VT.getSizeInBits() / Size;
// Extract known constant input data.
APInt UndefSrcElts[2];
@@ -6585,12 +6754,18 @@ static bool setTargetShuffleZeroElements(SDValue N,
getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
SrcEltBits[1], true, false)};
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ for (int i = 0; i < Size; ++i) {
int M = Mask[i];
// Already decoded as SM_SentinelZero / SM_SentinelUndef.
- if (M < 0)
+ if (M < 0) {
+ assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
continue;
+ }
// Determine shuffle input and normalize the mask.
unsigned SrcIdx = M / Size;
@@ -6599,7 +6774,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
// We are referencing an UNDEF input.
if (V.isUndef()) {
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
continue;
}
@@ -6612,31 +6787,64 @@ static bool setTargetShuffleZeroElements(SDValue N,
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
if (Idx != 0 && !VT.isFloatingPoint())
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
- Mask[i] = SM_SentinelZero;
+ KnownZero.setBit(i);
continue;
}
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
- Mask[i] = SM_SentinelUndef;
+ KnownUndef.setBit(i);
else if (SrcEltBits[SrcIdx][M] == 0)
- Mask[i] = SM_SentinelZero;
+ KnownZero.setBit(i);
}
}
- assert(VT.getVectorNumElements() == Mask.size() &&
+ assert(VT.getVectorNumElements() == (unsigned)Size &&
"Different mask size from vector size!");
return true;
}
+// Replace target shuffle mask elements with known undef/zero sentinels.
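+// e.g. Mask = {0, 1, 2, 3} with KnownZero bit 2 set becomes
+// {0, 1, SM_SentinelZero, 3}.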
+static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
+ const APInt &KnownUndef,
+ const APInt &KnownZero) {
+ unsigned NumElts = Mask.size();
+ assert(KnownUndef.getBitWidth() == NumElts &&
+ KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (KnownUndef[i])
+ Mask[i] = SM_SentinelUndef;
+ else if (KnownZero[i])
+ Mask[i] = SM_SentinelZero;
+ }
+}
+
+// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
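+// e.g. Mask = {0, SM_SentinelUndef, SM_SentinelZero, 3} gives
+// KnownUndef = 0b0010 and KnownZero = 0b0100.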
+static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef,
+ APInt &KnownZero) {
+ unsigned NumElts = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (SM_SentinelUndef == M)
+ KnownUndef.setBit(i);
+ if (SM_SentinelZero == M)
+ KnownZero.setBit(i);
+ }
+}
+
// Forward declaration (for getFauxShuffleMask recursive check).
-static bool resolveTargetShuffleInputs(SDValue Op,
- SmallVectorImpl<SDValue> &Inputs,
- SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG);
+// TODO: Use DemandedElts variant.
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
@@ -6644,7 +6852,8 @@ static bool resolveTargetShuffleInputs(SDValue Op,
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
@@ -6685,7 +6894,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Mask.push_back(SM_SentinelUndef);
continue;
}
- uint64_t ByteBits = EltBits[i].getZExtValue();
+ const APInt &ByteBits = EltBits[i];
if (ByteBits != 0 && ByteBits != 255)
return false;
Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
@@ -6696,8 +6905,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
case ISD::OR: {
// Inspect each operand at the byte level. We can merge these into a
// blend shuffle mask if for each byte at least one is masked out (zero).
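+    // e.g. (or (and X, <-1,0>), (and Y, <0,-1>)) selects X's element 0 and
+    // Y's element 1, since every byte is known zero in at least one operand.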
- KnownBits Known0 = DAG.computeKnownBits(N.getOperand(0), DemandedElts);
- KnownBits Known1 = DAG.computeKnownBits(N.getOperand(1), DemandedElts);
+ KnownBits Known0 =
+ DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
+ KnownBits Known1 =
+ DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
unsigned NumSizeInBytes = NumSizeInBits / 8;
@@ -6736,14 +6947,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
return false;
SmallVector<int, 64> SrcMask0, SrcMask1;
SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
- if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
- !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
+ if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
+ true) ||
+ !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
+ true))
return false;
- int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
+ size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
- for (int i = 0; i != MaskSize; ++i) {
+ for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
@@ -6751,14 +6964,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
else if (Mask1[i] == SM_SentinelZero)
Mask.push_back(Mask0[i]);
else if (Mask0[i] == SM_SentinelZero)
- Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size()));
+ Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
else
return false;
}
- for (SDValue &Op : SrcInputs0)
- Ops.push_back(Op);
- for (SDValue &Op : SrcInputs1)
- Ops.push_back(Op);
+ Ops.append(SrcInputs0.begin(), SrcInputs0.end());
+ Ops.append(SrcInputs1.begin(), SrcInputs1.end());
return true;
}
case ISD::INSERT_SUBVECTOR: {
@@ -6786,8 +6997,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
SmallVector<int, 64> SubMask;
SmallVector<SDValue, 2> SubInputs;
- if (!resolveTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
- SubMask, DAG))
+ if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
+ SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
@@ -6911,14 +7122,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
// as a truncation shuffle.
if (Opcode == X86ISD::PACKSS) {
if ((!N0.isUndef() &&
- DAG.ComputeNumSignBits(N0, EltsLHS) <= NumBitsPerElt) ||
+ DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
(!N1.isUndef() &&
- DAG.ComputeNumSignBits(N1, EltsRHS) <= NumBitsPerElt))
+ DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
return false;
} else {
APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
- if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS)) ||
- (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS)))
+ if ((!N0.isUndef() &&
+ !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
+ (!N1.isUndef() &&
+ !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
return false;
}
@@ -7061,23 +7274,45 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
Inputs = UsedInputs;
}
-/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
-/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
-/// remaining input indices in case we now have a unary shuffle and adjust the
-/// inputs accordingly.
+/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
+/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op,
- SmallVectorImpl<SDValue> &Inputs,
- SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG) {
+static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ APInt &KnownUndef, APInt &KnownZero,
+ SelectionDAG &DAG, unsigned Depth,
+ bool ResolveKnownElts) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
+ if (ResolveKnownElts)
+ resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
+ ResolveKnownElts)) {
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ return true;
+ }
+ return false;
+}
+
+static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG, unsigned Depth = 0,
+ bool ResolveKnownElts = true) {
+ EVT VT = Op.getValueType();
+ if (!VT.isSimple() || !VT.isVector())
+ return false;
+
+ APInt KnownUndef, KnownZero;
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
- if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
- if (!getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG))
- return false;
-
- resolveTargetShuffleInputsAndMask(Inputs, Mask);
- return true;
+ return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
+ KnownZero, DAG, Depth, ResolveKnownElts);
}
/// Returns the scalar element that will make up the ith
@@ -7414,7 +7649,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
SDLoc DL(Op);
SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getIntPtrConstant(InsertPSMask, DL));
+ DAG.getIntPtrConstant(InsertPSMask, DL, true));
return DAG.getBitcast(VT, Result);
}
@@ -7427,7 +7662,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getBitcast(ShVT, SrcOp);
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
- SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
+ SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
@@ -7439,7 +7674,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// the shuffle mask.
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
SDValue Ptr = LD->getBasePtr();
- if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+ if (!ISD::isNormalLoad(LD) || !LD->isSimple())
return SDValue();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
@@ -7504,6 +7739,49 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
return SDValue();
}
+// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
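+// e.g. (i32 (trunc (srl (i64 (load p)), 32))) returns the i64 load with a
+// ByteOffset of 4.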
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
+ if (ISD::isNON_EXTLoad(Elt.getNode())) {
+ auto *BaseLd = cast<LoadSDNode>(Elt);
+ if (!BaseLd->isSimple())
+ return false;
+ Ld = BaseLd;
+ ByteOffset = 0;
+ return true;
+ }
+
+ switch (Elt.getOpcode()) {
+ case ISD::BITCAST:
+ case ISD::TRUNCATE:
+ case ISD::SCALAR_TO_VECTOR:
+ return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
+ case ISD::SRL:
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = Elt.getConstantOperandVal(1);
+ if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
+ ByteOffset += Idx / 8;
+ return true;
+ }
+ }
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ SDValue Src = Elt.getOperand(0);
+ unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
+ unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
+ if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
+ findEltLoadSrc(Src, Ld, ByteOffset)) {
+ uint64_t Idx = Elt.getConstantOperandVal(1);
+ ByteOffset += Idx * (SrcSizeInBits / 8);
+ return true;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
@@ -7513,6 +7791,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
+ if ((VT.getScalarSizeInBits() % 8) != 0)
+ return SDValue();
+
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
@@ -7521,6 +7802,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+ SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7539,13 +7821,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Each loaded element must be the correct fractional portion of the
// requested vector load.
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
+ unsigned EltSizeInBits = Elt.getValueSizeInBits();
+ if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
return SDValue();
- if (!ISD::isNON_EXTLoad(Elt.getNode()))
+ if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
+ return SDValue();
+ unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
+ if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
return SDValue();
- Loads[i] = cast<LoadSDNode>(Elt);
LoadMask.setBit(i);
LastLoadedElt = i;
}
@@ -7575,6 +7860,24 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
+ // TODO: Support offsetting the base load.
+ if (ByteOffsets[FirstLoadedElt] != 0)
+ return SDValue();
+
+ // Check to see if the element's load is consecutive to the base load
+ // or offset from a previous (already checked) load.
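+  // e.g. with i16 elements split out of an i32 load, element 1 has a
+  // ByteOffset of 2 and maps back to base element 0 of the same load.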
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
+ LoadSDNode *Ld = Loads[EltIdx];
+ int64_t ByteOffset = ByteOffsets[EltIdx];
+ if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
+ int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
+ return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
+ Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
+ }
+ return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
+ EltIdx - FirstLoadedElt);
+ };
+
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEF and ZERO elements require an additional
// shuffle stage to clear the ZERO elements.
@@ -7582,8 +7885,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
- i - FirstLoadedElt)) {
+ if (!CheckConsecutiveLoad(LDBase, i)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
@@ -7595,8 +7897,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
- assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
- "Cannot merge volatile loads.");
+ assert(LDBase->isSimple() &&
+ "Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
@@ -7636,17 +7938,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
if (!isAfterLegalize && VT.isVector()) {
- SmallVector<int, 4> ClearMask(NumElems, -1);
- for (unsigned i = 0; i < NumElems; ++i) {
- if (ZeroMask[i])
- ClearMask[i] = i + NumElems;
- else if (LoadMask[i])
- ClearMask[i] = i;
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ if ((NumMaskElts % NumElems) == 0) {
+ unsigned Scale = NumMaskElts / NumElems;
+ SmallVector<int, 4> ClearMask(NumMaskElts, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (UndefMask[i])
+ continue;
+ int Offset = ZeroMask[i] ? NumMaskElts : 0;
+ for (unsigned j = 0; j != Scale; ++j)
+ ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
+ }
+ SDValue V = CreateLoad(VT, LDBase);
+ SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
- SDValue V = CreateLoad(VT, LDBase);
- SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
- : DAG.getConstantFP(0.0, DL, VT);
- return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
}
}
@@ -8194,34 +8501,10 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
"Unexpected type in LowerBUILD_VECTORvXi1!");
SDLoc dl(Op);
- if (ISD::isBuildVectorAllZeros(Op.getNode()))
- return Op;
-
- if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
+ ISD::isBuildVectorAllOnes(Op.getNode()))
return Op;
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- // Split the pieces.
- SDValue Lower =
- DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
- SDValue Upper =
- DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
- // We have to manually lower both halves so getNode doesn't try to
- // reassemble the build_vector.
- Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
- Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
- }
- SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
- if (Imm.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, Imm);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
-
- // Vector has one or more non-const elements
uint64_t Immediate = 0;
SmallVector<unsigned, 16> NonConstIdx;
bool IsSplat = true;
@@ -8244,29 +8527,40 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
}
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
- if (IsSplat)
- return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ if (IsSplat) {
+    // The build_vector allows the scalar element to be larger than the vector
+    // element type. We need to mask it before using it as a condition unless
+    // we know the upper bits are zero.
+ // FIXME: Use computeKnownBits instead of checking specific opcode?
+ SDValue Cond = Op.getOperand(SplatIdx);
+ assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
+ if (Cond.getOpcode() != ISD::SETCC)
+ Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
+ DAG.getConstant(1, dl, MVT::i8));
+ return DAG.getSelect(dl, VT, Cond,
DAG.getConstant(1, dl, VT),
DAG.getConstant(0, dl, VT));
+ }
// insert elements one by one
SDValue DstVec;
- SDValue Imm;
- if (Immediate) {
- MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
- Imm = DAG.getConstant(Immediate, dl, ImmVT);
- }
- else if (HasConstElts)
- Imm = DAG.getConstant(0, dl, VT);
- else
- Imm = DAG.getUNDEF(VT);
- if (Imm.getValueSizeInBits() == VT.getSizeInBits())
- DstVec = DAG.getBitcast(VT, Imm);
- else {
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
- DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
- DAG.getIntPtrConstant(0, dl));
- }
+ if (HasConstElts) {
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
+ SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
+ ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
+ ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
+ DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
+ SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ DstVec = DAG.getBitcast(VecVT, Imm);
+ DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ } else
+ DstVec = DAG.getUNDEF(VT);
for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
unsigned InsertIdx = NonConstIdx[i];
@@ -8757,7 +9051,7 @@ static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
// If we don't need the upper xmm, then perform as a xmm hop.
unsigned HalfNumElts = NumElts / 2;
if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
- MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
@@ -8965,21 +9259,14 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
MVT VT = Op.getSimpleValueType();
// Vectors containing all zeros can be matched by pxor and xorps.
- if (ISD::isBuildVectorAllZeros(Op.getNode())) {
- // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
- // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
- return Op;
-
- return getZeroVector(VT, Subtarget, DAG, DL);
- }
+ if (ISD::isBuildVectorAllZeros(Op.getNode()))
+ return Op;
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
- (VT == MVT::v8i32 && Subtarget.hasInt256()))
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getOnesVector(VT, DAG, DL);
@@ -9150,9 +9437,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
{4, 5, 6, 7, 4, 5, 6, 7});
if (Subtarget.hasXOP())
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
- LoLo, HiHi, IndicesVec,
- DAG.getConstant(0, DL, MVT::i8)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPS only uses index bits[0:1] to permute elements.
SDValue Res = DAG.getSelectCC(
@@ -9186,9 +9473,9 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
// VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
if (Subtarget.hasXOP())
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
- LoLo, HiHi, IndicesVec,
- DAG.getConstant(0, DL, MVT::i8)));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
+ IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
// Permute Lo and Hi and then select based on index range.
// This works as VPERMILPD only uses index bit[1] to permute elements.
SDValue Res = DAG.getSelectCC(
@@ -9283,7 +9570,7 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
return SDValue();
auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
- if (!PermIdx || PermIdx->getZExtValue() != Idx)
+ if (!PermIdx || PermIdx->getAPIntValue() != Idx)
return SDValue();
}
@@ -9434,23 +9721,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// it to i32 first.
if (EltVT == MVT::i16 || EltVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
- if (VT.getSizeInBits() >= 256) {
- MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
- if (Subtarget.hasAVX()) {
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
- } else {
- // Without AVX, we need to extend to a 128-bit vector and then
- // insert into the 256-bit vector.
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
- Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
- }
- } else {
- assert(VT.is128BitVector() && "Expected an SSE value type!");
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
- }
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
return DAG.getBitcast(VT, Item);
}
}
@@ -9549,8 +9822,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
// Recreate the wider vector with the lower and upper part.
- return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
- VT.getSizeInBits() / 2);
+ return concatSubVectors(Lower, Upper, DAG, dl);
}
// Let legalizer expand 2-wide build_vectors.
@@ -9703,8 +9975,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
// If we have more than 2 non-zeros, build each half separately.
if (NumNonZero > 2) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
- ResVT.getVectorNumElements()/2);
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
@@ -9745,30 +10016,47 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
- unsigned NumZero = 0;
- unsigned NumNonZero = 0;
+ uint64_t Zeros = 0;
uint64_t NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
SDValue SubVec = Op.getOperand(i);
if (SubVec.isUndef())
continue;
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
- ++NumZero;
- else {
- assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ Zeros |= (uint64_t)1 << i;
+ else
NonZeros |= (uint64_t)1 << i;
- ++NumNonZero;
- }
}
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+  // If we are inserting a non-zero vector and there are zeros in the LSBs and
+  // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
+  // insert_subvector will give us two kshifts.
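+  // e.g. v8i1 concat(zero, X, undef, undef) with v2i1 operands has
+  // NonZeros = 0b0010 and Zeros = 0b0001, so we insert X at index 0 of an
+  // undef vector and KSHIFTL it by 1 * 2 = 2 elements.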
+ if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
+ Log2_64(NonZeros) != NumOperands - 1) {
+ MVT ShiftVT = ResVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
+ ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ unsigned Idx = Log2_64(NonZeros);
+ SDValue SubVec = Op.getOperand(Idx);
+ unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
+ DAG.getUNDEF(ShiftVT), SubVec,
+ DAG.getIntPtrConstant(0, dl));
+ Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
+ DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
+ DAG.getIntPtrConstant(0, dl));
+ }
// If there are zero or one non-zeros we can handle this very simply.
- if (NumNonZero <= 1) {
- SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
- : DAG.getUNDEF(ResVT);
- if (!NumNonZero)
+ if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
+ SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
+ if (!NonZeros)
return Vec;
- unsigned Idx = countTrailingZeros(NonZeros);
+ unsigned Idx = Log2_64(NonZeros);
SDValue SubVec = Op.getOperand(Idx);
unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
@@ -9776,8 +10064,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
}
if (NumOperands > 2) {
- MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
- ResVT.getVectorNumElements()/2);
+ MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
Ops.slice(0, NumOperands/2));
@@ -9786,7 +10073,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
- assert(NumNonZero == 2 && "Simple cases not handled?");
+ assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
@@ -9794,7 +10081,6 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
DAG.getUNDEF(ResVT), Op.getOperand(0),
DAG.getIntPtrConstant(0, dl));
- unsigned NumElems = ResVT.getVectorNumElements();
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
DAG.getIntPtrConstant(NumElems/2, dl));
}
@@ -9997,42 +10283,44 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
-/// SM_SentinelZero is accepted as a valid negative index but must match in both.
+/// SM_SentinelZero is accepted as a valid negative index but must match in
+/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
- ArrayRef<int> ExpectedMask) {
+ ArrayRef<int> ExpectedMask,
+ SDValue V1 = SDValue(),
+ SDValue V2 = SDValue()) {
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
return false;
assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
"Illegal target shuffle mask");
- for (int i = 0; i < Size; ++i)
- if (Mask[i] == SM_SentinelUndef)
- continue;
- else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
- return false;
- else if (Mask[i] != ExpectedMask[i])
- return false;
-
- return true;
-}
+ // Check for out-of-range target shuffle mask indices.
+ if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
+ return false;
-// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
-// mask.
-static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
- const APInt &Zeroable) {
- int NumElts = Mask.size();
- assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
+ BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
+ BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
- SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
continue;
- assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
- TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (MaskBV && ExpectedBV &&
+ MaskBV->getOperand(Mask[i] % Size) ==
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ continue;
+ }
+ // TODO - handle SM_Sentinel equivalences.
+ return false;
}
- return TargetMask;
+ return true;
}
// Attempt to create a shuffle mask from a VSELECT condition mask.
@@ -10133,7 +10421,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
- return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
+ return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
@@ -10573,14 +10861,14 @@ static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
// Try binary shuffle.
SmallVector<int, 32> BinaryMask;
createPackShuffleMask(VT, BinaryMask, false);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
if (MatchPACK(V1, V2))
return true;
// Try unary shuffle.
SmallVector<int, 32> UnaryMask;
createPackShuffleMask(VT, UnaryMask, true);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
if (MatchPACK(V1, V1))
return true;
@@ -10685,9 +10973,9 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
- MutableArrayRef<int> TargetMask,
- bool &ForceV1Zero, bool &ForceV2Zero,
- uint64_t &BlendMask) {
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
@@ -10695,13 +10983,12 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
BlendMask = 0;
ForceV1Zero = false, ForceV2Zero = false;
- assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
+ assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
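+  // e.g. Mask = {0, 5, 2, 7} gives BlendMask = 0b1010 (elements 1 and 3 are
+  // taken from V2).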
- // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- int M = TargetMask[i];
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
if (M == i)
@@ -10710,16 +10997,16 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
BlendMask |= 1ull << i;
continue;
}
- if (M == SM_SentinelZero) {
+ if (Zeroable[i]) {
if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- TargetMask[i] = i;
+ Mask[i] = i;
continue;
}
if (V2IsZeroOrUndef) {
ForceV2Zero = true;
BlendMask |= 1ull << i;
- TargetMask[i] = i + Size;
+ Mask[i] = i + Size;
continue;
}
}
@@ -10748,11 +11035,10 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
-
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
- if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ SmallVector<int, 64> Mask(Original.begin(), Original.end());
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
BlendMask))
return SDValue();
@@ -10778,7 +11064,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v8i16:
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
case MVT::v16i16: {
assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
SmallVector<int, 8> RepeatedMask;
@@ -10790,7 +11076,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (RepeatedMask[i] >= 8)
BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(BlendMask, DL, MVT::i8));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
}
// Use PBLENDW for lower/upper lanes and then blend lanes.
// TODO - we should allow 2 PBLENDW here and leave shuffle combine to
@@ -10799,9 +11085,9 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t HiMask = (BlendMask >> 8) & 0xFF;
if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(LoMask, DL, MVT::i8));
+ DAG.getTargetConstant(LoMask, DL, MVT::i8));
SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
- DAG.getConstant(HiMask, DL, MVT::i8));
+ DAG.getTargetConstant(HiMask, DL, MVT::i8));
return DAG.getVectorShuffle(
MVT::v16i16, DL, Lo, Hi,
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
@@ -11061,7 +11347,7 @@ static SDValue lowerShuffleAsByteRotateAndPermute(
SDValue Rotate = DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
DAG.getBitcast(ByteVT, Lo),
- DAG.getConstant(Scale * RotAmt, DL, MVT::i8)));
+ DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
@@ -11268,7 +11554,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
"512-bit PALIGNR requires BWI instructions");
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
- DAG.getConstant(ByteRotation, DL, MVT::i8)));
+ DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
}
assert(VT.is128BitVector() &&
@@ -11282,10 +11568,12 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
- SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
- DAG.getConstant(LoByteShift, DL, MVT::i8));
- SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
- DAG.getConstant(HiByteShift, DL, MVT::i8));
+ SDValue LoShift =
+ DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
+ DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
+ SDValue HiShift =
+ DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
+ DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
return DAG.getBitcast(VT,
DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
@@ -11317,7 +11605,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
- DAG.getConstant(Rotation, DL, MVT::i8));
+ DAG.getTargetConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a byte shift sequence.
@@ -11356,27 +11644,27 @@ static SDValue lowerVectorShuffleAsByteShiftMask(
if (ZeroLo == 0) {
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
} else if (ZeroHi == 0) {
unsigned Shift = Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else if (!Subtarget.hasSSSE3()) {
// If we don't have PSHUFB then it's worth avoiding an AND constant mask
// by performing 3 byte shifts. Shuffle combining can kick in above that.
// TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Shift += Mask[ZeroLo] % NumElts;
Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * Shift, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
- DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+ DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
} else
return SDValue();
@@ -11498,7 +11786,7 @@ static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
"Illegal integer vector type");
V = DAG.getBitcast(ShiftVT, V);
V = DAG.getNode(Opcode, DL, ShiftVT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
return DAG.getBitcast(VT, V);
}
@@ -11632,14 +11920,14 @@ static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
V2 ? V2 : DAG.getUNDEF(VT),
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
@@ -11686,9 +11974,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
};
- // Found a valid zext mask! Try various lowering strategies based on the
+  // Found a valid any/zero extend mask! Try various lowering strategies based on the
// input type and available ISA extensions.
- // TODO: Add AnyExt support.
if (Subtarget.hasSSE41()) {
// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
@@ -11697,7 +11984,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
- InputV = getExtendInVec(ISD::ZERO_EXTEND, DL, ExtVT, InputV, DAG);
+ InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
+ ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
@@ -11736,8 +12024,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(LoIdx, DL, MVT::i8)));
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
return DAG.getBitcast(VT, Lo);
@@ -11745,8 +12033,8 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
int HiIdx = (Offset + 1) * EltBits;
SDValue Hi = DAG.getBitcast(
MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(HiIdx, DL, MVT::i8)));
+ DAG.getTargetConstant(EltBits, DL, MVT::i8),
+ DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
@@ -11759,8 +12047,12 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
SDValue PSHUFBMask[16];
for (int i = 0; i < 16; ++i) {
int Idx = Offset + (i / Scale);
- PSHUFBMask[i] = DAG.getConstant(
- (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ if ((i % Scale == 0 && SafeOffset(Idx))) {
+ PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
+ continue;
+ }
+ PSHUFBMask[i] =
+ AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
}
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(
@@ -12052,9 +12344,9 @@ static SDValue lowerShuffleAsElementInsertion(
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
} else {
V2 = DAG.getBitcast(MVT::v16i8, V2);
- V2 = DAG.getNode(
- X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
- DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
+ V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
+ DAG.getTargetConstant(
+ V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
V2 = DAG.getBitcast(VT, V2);
}
}
@@ -12294,7 +12586,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+ } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
@@ -12486,7 +12778,7 @@ static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
// Insert the V2 element into the desired position.
return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
@@ -12635,14 +12927,14 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
return DAG.getNode(
X86ISD::SHUFP, DL, MVT::v2f64,
Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
@@ -12688,7 +12980,7 @@ static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
@@ -12996,10 +13288,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
- Mask, Subtarget, DAG))
- return Broadcast;
+ // Try to use broadcast unless the mask only has one non-undef element.
+ if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+ }
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
@@ -13680,16 +13974,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
if (NumV2Inputs == 0) {
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
- Mask, Subtarget, DAG))
- return Broadcast;
-
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
Zeroable, Subtarget, DAG))
return Shift;
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
@@ -13984,8 +14278,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
// Unpack the bytes to form the i16s that will be shuffled into place.
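+    // Only feed V1 into the unpack inputs whose byte positions (even or odd)
+    // are actually referenced by the mask; the unused side can stay undef.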
+ bool EvenInUse = false, OddInUse = false;
+ for (int i = 0; i < 16; i += 2) {
+ EvenInUse |= (Mask[i + 0] >= 0);
+ OddInUse |= (Mask[i + 1] >= 0);
+ if (EvenInUse && OddInUse)
+ break;
+ }
V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- MVT::v16i8, V1, V1);
+ MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
+ OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
for (int i = 0; i < 16; ++i)
@@ -14100,11 +14402,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
- // We use the mask type to pick which bytes are preserved based on how many
- // elements are dropped.
- MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
- SDValue ByteClearMask = DAG.getBitcast(
- MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
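+    // e.g. NumEvenDrops == 1 builds the mask <0xFF,0,0xFF,0,...>, keeping
+    // every second byte.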
+ SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
+ for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
+ ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
+ SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
if (!IsSingleInput)
V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
@@ -14448,16 +14749,14 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
}
-/// Lower a vector shuffle crossing multiple 128-bit lanes as
-/// a permutation and blend of those lanes.
+/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
+/// source with a lane permutation.
///
-/// This essentially blends the out-of-lane inputs to each lane into the lane
-/// from a permuted copy of the vector. This lowering strategy results in four
-/// instructions in the worst case for a single-input cross lane shuffle which
-/// is lower than any other fully general cross-lane shuffle strategy I'm aware
-/// of. Special cases for each particular shuffle pattern should be handled
-/// prior to trying this lowering.
-static SDValue lowerShuffleAsLanePermuteAndBlend(
+/// This lowering strategy results in four instructions in the worst case for a
+/// single-input cross lane shuffle which is lower than any other fully general
+/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
+/// shuffle pattern should be handled prior to trying this lowering.
+static SDValue lowerShuffleAsLanePermuteAndShuffle(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG, const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
@@ -14484,24 +14783,28 @@ static SDValue lowerShuffleAsLanePermuteAndBlend(
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
+ // TODO - we could support shuffling V2 in the Flipped input.
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
- SmallVector<int, 32> FlippedBlendMask(Size);
- for (int i = 0; i < Size; ++i)
- FlippedBlendMask[i] =
- Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
- ? Mask[i]
- : Mask[i] % LaneSize +
- (i / LaneSize) * LaneSize + Size);
+ SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+ assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
+ "In-lane shuffle mask expected");
- // Flip the vector, and blend the results which should now be in-lane.
+ // Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
- Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
- { 2, 3, 0, 1 });
+ Flipped =
+ DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
Flipped = DAG.getBitcast(VT, Flipped);
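+  // e.g. for v4f64 Mask = {2,3,1,0}, InLaneMask is {4,5,7,6}: element 4 picks
+  // Flipped[0] == V1[2], element 7 picks Flipped[3] == V1[1], and so on.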
- return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
}
/// Handle lowering 2-lane 128-bit shuffles.
@@ -14565,8 +14868,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
((WidenedMask[1] % 2) << 1);
- return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, DL, MVT::i8));
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
@@ -14598,7 +14901,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
V2 = DAG.getUNDEF(VT);
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Lower a vector shuffle by first fixing the 128-bit lanes and then
@@ -14616,26 +14919,26 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
if (is128BitLaneRepeatedShuffleMask(VT, Mask))
return SDValue();
- int Size = Mask.size();
+ int NumElts = Mask.size();
int NumLanes = VT.getSizeInBits() / 128;
- int LaneSize = 128 / VT.getScalarSizeInBits();
- SmallVector<int, 16> RepeatMask(LaneSize, -1);
+ int NumLaneElts = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
// First pass will try to fill in the RepeatMask from lanes that need two
// sources.
for (int Lane = 0; Lane != NumLanes; ++Lane) {
- int Srcs[2] = { -1, -1 };
- SmallVector<int, 16> InLaneMask(LaneSize, -1);
- for (int i = 0; i != LaneSize; ++i) {
- int M = Mask[(Lane * LaneSize) + i];
+ int Srcs[2] = {-1, -1};
+ SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// Determine which of the possible input lanes (NumLanes from each source)
// this element comes from. Assign that as one of the sources for this
// lane. We can assign up to 2 sources for this lane. If we run out of
// sources we can't do anything.
- int LaneSrc = M / LaneSize;
+ int LaneSrc = M / NumLaneElts;
int Src;
if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
Src = 0;
@@ -14645,7 +14948,7 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
return SDValue();
Srcs[Src] = LaneSrc;
- InLaneMask[i] = (M % LaneSize) + Src * Size;
+ InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
}
// If this lane has two sources, see if it fits with the repeat mask so far.
@@ -14701,23 +15004,23 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
if (LaneSrcs[Lane][0] >= 0)
continue;
- for (int i = 0; i != LaneSize; ++i) {
- int M = Mask[(Lane * LaneSize) + i];
+ for (int i = 0; i != NumLaneElts; ++i) {
+ int M = Mask[(Lane * NumLaneElts) + i];
if (M < 0)
continue;
// If RepeatMask isn't defined yet we can define it ourself.
if (RepeatMask[i] < 0)
- RepeatMask[i] = M % LaneSize;
+ RepeatMask[i] = M % NumLaneElts;
- if (RepeatMask[i] < Size) {
- if (RepeatMask[i] != M % LaneSize)
+ if (RepeatMask[i] < NumElts) {
+ if (RepeatMask[i] != M % NumLaneElts)
return SDValue();
- LaneSrcs[Lane][0] = M / LaneSize;
+ LaneSrcs[Lane][0] = M / NumLaneElts;
} else {
- if (RepeatMask[i] != ((M % LaneSize) + Size))
+ if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
return SDValue();
- LaneSrcs[Lane][1] = M / LaneSize;
+ LaneSrcs[Lane][1] = M / NumLaneElts;
}
}
@@ -14725,14 +15028,14 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
return SDValue();
}
- SmallVector<int, 16> NewMask(Size, -1);
+ SmallVector<int, 16> NewMask(NumElts, -1);
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][0];
- for (int i = 0; i != LaneSize; ++i) {
+ for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
- M = Src * LaneSize + i;
- NewMask[Lane * LaneSize + i] = M;
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
@@ -14745,11 +15048,11 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
for (int Lane = 0; Lane != NumLanes; ++Lane) {
int Src = LaneSrcs[Lane][1];
- for (int i = 0; i != LaneSize; ++i) {
+ for (int i = 0; i != NumLaneElts; ++i) {
int M = -1;
if (Src >= 0)
- M = Src * LaneSize + i;
- NewMask[Lane * LaneSize + i] = M;
+ M = Src * NumLaneElts + i;
+ NewMask[Lane * NumLaneElts + i] = M;
}
}
SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
@@ -14760,12 +15063,12 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
return SDValue();
- for (int i = 0; i != Size; ++i) {
- NewMask[i] = RepeatMask[i % LaneSize];
+ for (int i = 0; i != NumElts; ++i) {
+ NewMask[i] = RepeatMask[i % NumLaneElts];
if (NewMask[i] < 0)
continue;
- NewMask[i] += (i / LaneSize) * LaneSize;
+ NewMask[i] += (i / NumLaneElts) * NumLaneElts;
}
return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
}
@@ -14831,14 +15134,13 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
MVT VT = V1.getSimpleValueType();
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
auto getHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
@@ -14853,6 +15155,14 @@ static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
SDValue Half1 = getHalfVector(HalfIdx1);
SDValue Half2 = getHalfVector(HalfIdx2);
SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ if (UseConcat) {
+ SDValue Op0 = V;
+ SDValue Op1 = DAG.getUNDEF(HalfVT);
+ if (UndefLower)
+ std::swap(Op0, Op1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
+ }
+
unsigned Offset = UndefLower ? HalfNumElts : 0;
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
@@ -14877,9 +15187,8 @@ static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- unsigned NumElts = VT.getVectorNumElements();
- unsigned HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ unsigned HalfNumElts = HalfVT.getVectorNumElements();
if (!UndefLower &&
isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
@@ -15155,11 +15464,19 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
}
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &ShuffleImm, ArrayRef<int> Mask) {
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ unsigned &ShuffleImm, ArrayRef<int> Mask,
+ const APInt &Zeroable) {
int NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
+ assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
+ "Illegal shuffle mask");
+
+ bool ZeroLane[2] = { true, true };
+ for (int i = 0; i < NumElts; ++i)
+ ZeroLane[i & 1] &= Zeroable[i];
// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
@@ -15167,7 +15484,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
bool ShufpdMask = true;
bool CommutableMask = true;
for (int i = 0; i < NumElts; ++i) {
- if (Mask[i] == SM_SentinelUndef)
+ if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
continue;
if (Mask[i] < 0)
return false;
@@ -15180,30 +15497,77 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
ShuffleImm |= (Mask[i] % 2) << i;
}
- if (ShufpdMask)
- return true;
- if (CommutableMask) {
+ if (!ShufpdMask && !CommutableMask)
+ return false;
+
+ if (!ShufpdMask && CommutableMask)
std::swap(V1, V2);
- return true;
- }
- return false;
+ ForceV1Zero = ZeroLane[0];
+ ForceV2Zero = ZeroLane[1];
+ return true;
}
-static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
- assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
+static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
"Unexpected data type for VSHUFPD");
unsigned Immediate = 0;
- if (!matchShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
+ Mask, Zeroable))
return SDValue();
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
- DAG.getConstant(Immediate, DL, MVT::i8));
+ DAG.getTargetConstant(Immediate, DL, MVT::i8));
}
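A standalone sketch of the immediate-matching idea used above may help; it models only the v4f64 case, omits the new ForceV1Zero/ForceV2Zero zeroable-lane handling, and the names are hypothetical rather than LLVM API:

#include <array>
#include <cstdio>
#include <optional>

// Minimal scalar model of SHUFPD matching for v4f64: element i must come from
// the pair {0/1, 4/5, 2/3, 6/7}[i] (even elements read V1, odd elements read
// V2), and bit i of the immediate selects the low or high double of that pair.
static std::optional<unsigned> matchShufpdImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                            // undef element, any bit is fine
    int Base = (i % 2) * 4 + (i / 2) * 2;  // start of the allowed pair
    if (M != Base && M != Base + 1)
      return std::nullopt;                 // does not fit the SHUFPD pattern
    Imm |= unsigned(M % 2) << i;           // low/high selector for element i
  }
  return Imm;
}

int main() {
  // Mask <0, 5, 2, 7> fits the pattern and encodes as 0b1010.
  if (auto Imm = matchShufpdImm({0, 5, 2, 7}))
    std::printf("imm = 0x%X\n", *Imm);     // prints imm = 0xA
}

With the patch above, a lane whose elements are all zeroable additionally forces the corresponding source operand to a real zero vector before the X86ISD::SHUFP node is emitted.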
+// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
+// by zeroable elements in the remaining 24 elements. Turn this into two
+// vmovqb instructions shuffled together.
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v32i8 && "Unexpected type!");
+
+ // The first 8 indices should be every 8th element.
+ if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
+ return SDValue();
+
+ // Remaining elements need to be zeroable.
+ if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
+ return SDValue();
+
+ V1 = DAG.getBitcast(MVT::v4i64, V1);
+ V2 = DAG.getBitcast(MVT::v4i64, V2);
+
+ V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
+ V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
+
+ // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
+ // the upper bits of the result using an unpckldq.
+ SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ 4, 5, 6, 7, 20, 21, 22, 23 });
+ // Insert the unpckldq into a zero vector to widen to v32i8.
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
+ DAG.getConstant(0, DL, MVT::v32i8), Unpack,
+ DAG.getIntPtrConstant(0, DL));
+}
+
+
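The byte flow of lowerShuffleAsVTRUNCAndUnpack above can be checked with a small scalar model (hypothetical standalone code, not DAG nodes): each VTRUNC keeps the low byte of every 64-bit element and zeroes the other 12 bytes, and the unpack mask interleaves the two 4-byte groups into the low half of the result.

#include <array>
#include <cstdint>
#include <cstdio>

// Model of VTRUNC v4i64 -> v16i8 (vmovqb): low byte of each qword, rest zero.
static std::array<uint8_t, 16> vtruncModel(const std::array<uint64_t, 4> &V) {
  std::array<uint8_t, 16> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = uint8_t(V[i]);
  return R;
}

int main() {
  std::array<uint64_t, 4> A = {0x11, 0x22, 0x33, 0x44};
  std::array<uint64_t, 4> B = {0xAA, 0xBB, 0xCC, 0xDD};
  std::array<uint8_t, 16> TA = vtruncModel(A), TB = vtruncModel(B);

  // The unpckldq-style mask used above; indices >= 16 select from the second
  // truncated source.
  const int Mask[16] = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
  for (int i = 0; i != 16; ++i)
    std::printf("%02X ", unsigned(Mask[i] < 16 ? TA[Mask[i]] : TB[Mask[i] - 16]));
  std::printf("\n"); // 11 22 33 44 AA BB CC DD 00 00 00 00 00 00 00 00
}

The low 8 bytes are exactly the {0, 8, 16, 24, 32, 40, 48, 56} elements of the original v32i8 shuffle, and the INSERT_SUBVECTOR into a zero v32i8 supplies the remaining zeroable bytes.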
/// Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -15236,7 +15600,7 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
- DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
// With AVX2 we have direct support for this permutation.
@@ -15256,8 +15620,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Otherwise, fall back.
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG,
- Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
@@ -15269,7 +15633,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Op;
// If we have one input in place, then we can permute the other input and
@@ -15473,8 +15838,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
// Otherwise, fall back.
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -15681,8 +16046,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask,
- DAG, Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
+ DAG, Subtarget);
}
SmallVector<int, 8> RepeatedMask;
@@ -15780,8 +16145,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
- return lowerShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG,
- Subtarget);
+ return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
+ DAG, Subtarget);
}
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
@@ -15803,6 +16168,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
+ // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
+ // by zeroable elements in the remaining 24 elements. Turn this into two
+ // vmovqb instructions shuffled together.
+ if (Subtarget.hasVLX())
+ if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
+ Mask, Zeroable, DAG))
+ return V;
+
// Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG);
@@ -15974,7 +16347,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
/// Handle lowering of 8-lane 64-bit floating point shuffles.
@@ -15999,7 +16372,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
- DAG.getConstant(VPERMILPMask, DL, MVT::i8));
+ DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
}
SmallVector<int, 4> RepeatedMask;
@@ -16016,7 +16389,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Unpck;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return Op;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
@@ -16389,6 +16763,49 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
+ MVT VT, SDValue V1, SDValue V2,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Shuffle should be unary.
+ if (!V2.isUndef())
+ return SDValue();
+
+ int ShiftAmt = -1;
+ int NumElts = Mask.size();
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
+ "Unexpected mask index.");
+ if (M < 0)
+ continue;
+
+ // The first non-undef element determines our shift amount.
+ if (ShiftAmt < 0) {
+ ShiftAmt = M - i;
+ // Need to be shifting right.
+ if (ShiftAmt <= 0)
+ return SDValue();
+ }
+ // All non-undef elements must shift by the same amount.
+ if (ShiftAmt != M - i)
+ return SDValue();
+ }
+ assert(ShiftAmt >= 0 && "All undef?");
+
+ // Great, we found a shift right.
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V1,
+ DAG.getIntPtrConstant(0, DL));
+ Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+}
+
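Because vXi1 vectors live in k-registers, the unary shuffle matched by lower1BitShuffleAsKSHIFTR is just a logical right shift of the mask bits. A tiny standalone model (k-register approximated as a uint16_t, names hypothetical):

#include <cstdint>
#include <cstdio>

// A v8i1 shuffle mask <2, 3, 4, 5, 6, 7, u, u> is a right shift of the mask
// bits by 2. The DAG code first widens to v8i1/v16i1 so the KSHIFTR is legal;
// since the shifted-in elements are undef here, no compensating KSHIFTL is
// needed (unlike the zero-filling KSHIFT path later in lower1BitShuffle).
static uint16_t kshiftrModel(uint16_t K, unsigned Amt) { return K >> Amt; }

int main() {
  uint16_t K = 0xB4;                 // bit i == element i of the v8i1 value
  std::printf("%#x -> %#x\n", unsigned(K), unsigned(kshiftrModel(K, 2)));
  // prints 0xb4 -> 0x2d
}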
// Determine if this shuffle can be implemented with a KSHIFT instruction.
// Returns the shift amount if possible or -1 if not. This is a simplified
// version of matchShuffleAsShift.
@@ -16434,13 +16851,20 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/o basic ISA!");
- unsigned NumElts = Mask.size();
+ int NumElts = Mask.size();
// Try to recognize shuffles that are just padding a subvector with zeros.
- unsigned SubvecElts = 0;
- for (int i = 0; i != (int)NumElts; ++i) {
- if (Mask[i] >= 0 && Mask[i] != i)
- break;
+ int SubvecElts = 0;
+ int Src = -1;
+ for (int i = 0; i != NumElts; ++i) {
+ if (Mask[i] >= 0) {
+ // Grab the source from the first valid mask element. All subsequent elements
+ // need to use this same source.
+ if (Src < 0)
+ Src = Mask[i] / NumElts;
+ if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
+ break;
+ }
++SubvecElts;
}
@@ -16451,30 +16875,54 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Make sure the number of zeroable bits in the top at least covers the bits
// not covered by the subvector.
- if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
+ assert(Src >= 0 && "Expected a source!");
MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
- V1, DAG.getIntPtrConstant(0, DL));
+ Src == 0 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL),
+ DAG.getConstant(0, DL, VT),
Extract, DAG.getIntPtrConstant(0, DL));
}
+ // Try a simple shift right with undef elements. Later we'll try with zeros.
+ if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
+ DAG))
+ return Shift;
+
// Try to match KSHIFTs.
- // TODO: Support narrower than legal shifts by widening and extracting.
- if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
- unsigned Offset = 0;
- for (SDValue V : { V1, V2 }) {
- unsigned Opcode;
- int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
- if (ShiftAmt >= 0)
- return DAG.getNode(Opcode, DL, VT, V,
- DAG.getConstant(ShiftAmt, DL, MVT::i8));
- Offset += NumElts; // Increment for next iteration.
+ unsigned Offset = 0;
+ for (SDValue V : { V1, V2 }) {
+ unsigned Opcode;
+ int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
+ if (ShiftAmt >= 0) {
+ MVT WideVT = VT;
+ if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
+ WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
+ DAG.getUNDEF(WideVT), V,
+ DAG.getIntPtrConstant(0, DL));
+ // Widened right shifts need two shifts to ensure we shift in zeroes.
+ if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
+ int WideElts = WideVT.getVectorNumElements();
+ // Shift left to put the original vector in the MSBs of the new size.
+ Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
+ DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
+ // Increase the shift amount to account for the left shift.
+ ShiftAmt += WideElts - NumElts;
+ }
+
+ Res = DAG.getNode(Opcode, DL, WideVT, Res,
+ DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
}
+ Offset += NumElts; // Increment for next iteration.
}
+
MVT ExtVT;
switch (VT.SimpleTy) {
default:
@@ -16594,7 +17042,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
+ ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
MVT VT = Op.getSimpleValueType();
@@ -16620,8 +17068,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// undef as well. This makes it easier to match the shuffle based solely on
// the mask.
if (V2IsUndef &&
- any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
- SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+ any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+ SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
for (int &M : NewMask)
if (M >= NumElements)
M = -1;
@@ -16629,15 +17077,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
}
// Check for illegal shuffle mask element index values.
- int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
- assert(llvm::all_of(Mask,
+ int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+ (void)MaskUpperLimit;
+ assert(llvm::all_of(OrigMask,
[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
"Out of bounds shuffle index");
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
@@ -16645,11 +17094,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// Create an alternative mask with info about zeroable elements.
// Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
if (V2IsZero) {
assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
for (int i = 0; i != NumElements; ++i)
- if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
ZeroableMask[i] = SM_SentinelZero;
}
@@ -16664,7 +17113,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+ if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
Subtarget, DAG))
return Broadcast;
@@ -16700,8 +17149,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
}
// Commute the shuffle if it will improve canonicalization.
- if (canonicalizeShuffleMaskWithCommute(Mask))
- return DAG.getCommutedVectorShuffle(*SVOp);
+ SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
+ if (canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(V1, V2);
+ }
if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
return V;
@@ -16910,7 +17362,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
// Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
@@ -17137,8 +17589,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
- N2 = DAG.getIntPtrConstant(1, dl);
- return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
+ DAG.getTargetConstant(1, dl, MVT::i8));
}
}
@@ -17207,14 +17659,14 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
- N2 = DAG.getIntPtrConstant(1, dl);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
- return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+ return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
+ DAG.getTargetConstant(1, dl, MVT::i8));
}
- N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
+ DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
}
// PINSR* works with constant index.
@@ -17300,7 +17752,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
// Shift to the LSB.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
+ DAG.getTargetConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
DAG.getIntPtrConstant(0, dl));
@@ -17841,10 +18293,10 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
std::swap(Op0, Op1);
APInt APIntShiftAmt;
- if (isConstantSplat(Amt, APIntShiftAmt)) {
+ if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
- return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
- Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
+ Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
}
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
@@ -17970,6 +18422,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
+ if (VT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
@@ -18072,6 +18527,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
return Result;
}
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+ return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
/// 64-bit unsigned integer to double expansion.
static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -18126,8 +18591,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3()) {
- // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'.
+ if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
@@ -18273,7 +18737,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
- VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
@@ -18281,7 +18745,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
- VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
+ VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -18329,16 +18793,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT SrcVT = N0.getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
- if (Op.getSimpleValueType().isVector())
+ if (DstVT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
+
+ if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
- MVT SrcVT = N0.getSimpleValueType();
- MVT DstVT = Op.getSimpleValueType();
-
if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
(SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
// Conversions from unsigned i32 to f32/f64 are legal,
@@ -18346,6 +18812,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return Op;
}
+ // Promote i32 to i64 and use a signed conversion on 64-bit targets.
+ if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+ }
+
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
@@ -18579,7 +19051,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
@@ -18602,10 +19074,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
// Concat upper and lower parts.
//
-
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
-
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
// Short-circuit if we can determine that each 128-bit half is the same value.
@@ -18903,9 +19372,29 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
- // If called by the legalizer just return.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
+ // If we're called by the type legalizer, handle a few cases.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(InVT)) {
+ if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
+ VT.is128BitVector()) {
+ assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ // The default behavior is to truncate one step, concatenate, and then
+ // truncate the remainder. We'd rather produce two 64-bit results and
+ // concatenate those.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Otherwise let default legalization handle it.
return SDValue();
+ }
if (VT.getVectorElementType() == MVT::i1)
return LowerTruncateVecI1(Op, DAG, Subtarget);
@@ -18940,6 +19429,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
return V;
+ // Handle truncation of V256 to V128 using shuffles.
+ assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -19016,22 +19508,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
}
- // Handle truncation of V256 to V128 using shuffles.
- assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
-
- assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
-
- unsigned NumElems = VT.getVectorNumElements();
- MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
-
- SmallVector<int, 16> MaskVec(NumElems * 2, -1);
- // Prepare truncation shuffle mask
- for (unsigned i = 0; i != NumElems; ++i)
- MaskVec[i] = i * 2;
- In = DAG.getBitcast(NVT, In);
- SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
- DAG.getIntPtrConstant(0, DL));
+ llvm_unreachable("All 256->128 cases should have been handled above!");
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
@@ -19041,6 +19518,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (Op.getOpcode() == ISD::FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
+ }
+
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
@@ -19075,14 +19563,27 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
- if (!IsSigned && Subtarget.hasAVX512()) {
- // Conversions from f32/f64 should be legal.
- if (UseSSEReg)
+ if (!IsSigned && UseSSEReg) {
+ // Conversions from f32/f64 with AVX512 should be legal.
+ if (Subtarget.hasAVX512())
return Op;
- // Use default expansion.
+ // Use default expansion for i64.
if (VT == MVT::i64)
return SDValue();
+
+ assert(VT == MVT::i32 && "Unexpected VT!");
+
+ // Promote i32 to i64 and use a signed operation on 64-bit targets.
+ if (Subtarget.is64Bit()) {
+ SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
+ // use fisttp which will be handled later.
+ if (!Subtarget.hasSSE3())
+ return SDValue();
}
// Promote i16 to i32 if we can use an SSE operation.
@@ -19103,12 +19604,17 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
-static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
+ if (VT == MVT::f128) {
+ RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
+ return LowerF128Call(Op, DAG, LC);
+ }
+
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
return DAG.getNode(X86ISD::VFPEXT, DL, VT,
@@ -19116,14 +19622,31 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
In, DAG.getUNDEF(SVT)));
}
-/// Horizontal vector math instructions may be slower than normal math with
-/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
-/// implementation, and likely shuffle complexity of the alternate sequence.
-static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
- bool HasFastHOps = Subtarget.hasFastHorizontalOps();
- return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getSimpleValueType();
+
+ // It's legal except when f128 is involved
+ if (SVT != MVT::f128)
+ return Op;
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
+
+ // FP_ROUND node has a second operand indicating whether it is known to be
+ // precise. That doesn't take part in the LibCall so we can't directly use
+ // LowerF128Call.
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
+}
+
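All of the f128 paths added in this area (lowerFaddFsub, the INT_TO_FP/FP_TO_INT handlers, LowerFP_EXTEND, LowerFP_ROUND, LowerSETCC) end in RTLIB soft-float calls rather than native code. As a rough source-level illustration, assuming an x86-64 toolchain that exposes __float128 and the usual libgcc/compiler-rt routine names (e.g. __addtf3, __trunctfdf2 -- stated as an assumption, not taken from this patch):

#include <cstdio>

int main() {
  __float128 A = 1.25;    // f128; availability is toolchain/target dependent
  __float128 B = 2.5;
  __float128 C = A + B;   // FADD f128 typically lowers to a call to __addtf3
  double D = double(C);   // FP_ROUND f128 -> f64 typically calls __trunctfdf2
  std::printf("%f\n", D); // prints 3.750000
}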
+// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
+// the default expansion of STRICT_FP_ROUND.
+static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
+ // FIXME: Need to form a libcall with an input chain for f128.
+ assert(Op.getOperand(0).getValueType() != MVT::f128 &&
+ "Don't know how to handle f128 yet!");
+ return Op;
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -19200,8 +19723,13 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
-static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::f128) {
+ RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
+ : RTLIB::SUB_F128;
+ return LowerF128Call(Op, DAG, LC);
+ }
+
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -19358,13 +19886,13 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
SelectionDAG &DAG) {
return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
+ DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
/// style scalarized (associative) reduction patterns.
-static bool matchBitOpReduction(SDValue Op, ISD::NodeType BinOp,
- SmallVectorImpl<SDValue> &SrcOps) {
+static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
+ SmallVectorImpl<SDValue> &SrcOps) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
@@ -19437,7 +19965,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
return SDValue();
SmallVector<SDValue, 8> VecIns;
- if (!matchBitOpReduction(Op, ISD::OR, VecIns))
+ if (!matchScalarReduction(Op, ISD::OR, VecIns))
return SDValue();
// Quit if not 128/256-bit vector.
@@ -19461,8 +19989,8 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL,
- MVT::i8);
+ X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
+ DL, MVT::i8);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
@@ -19576,6 +20104,13 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case X86ISD::XOR:
case X86ISD::AND:
return SDValue(Op.getNode(), 1);
+ case ISD::SSUBO:
+ case ISD::USUBO: {
+ // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
+ Op->getOperand(1)).getValue(1);
+ }
default:
default_case:
break;
@@ -19766,6 +20301,63 @@ unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
return 2;
}
+SDValue
+X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N,0); // Lower SDIV as SDIV
+
+ assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
+ "Unexpected divisor!");
+
+ // Only perform this transform if CMOV is supported; otherwise the select
+ // below will become a branch.
+ if (!Subtarget.hasCMov())
+ return SDValue();
+
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ // FIXME: Support i8.
+ if (VT != MVT::i16 && VT != MVT::i32 &&
+ !(Subtarget.is64Bit() && VT == MVT::i64))
+ return SDValue();
+
+ unsigned Lg2 = Divisor.countTrailingZeros();
+
+ // If the divisor is 2 or -2, the default expansion is better.
+ if (Lg2 == 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+ SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+
+ // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(Add.getNode());
+ Created.push_back(CMov.getNode());
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ Created.push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+}
+
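The CMOV expansion built by BuildSDIVPow2 above is the standard branchless signed-division-by-2^k sequence: bias a negative dividend by 2^k - 1, arithmetic-shift right by k, and negate the quotient for a negative divisor. A standalone scalar sketch of the same arithmetic (hypothetical helper, i32 only, assumes arithmetic right shift of negative values):

#include <cassert>
#include <cstdint>
#include <cstdio>

static int32_t sdivPow2(int32_t X, int32_t Divisor) {
  bool NegDivisor = Divisor < 0;
  uint32_t Abs = NegDivisor ? 0u - uint32_t(Divisor) : uint32_t(Divisor);
  assert(Abs > 2 && (Abs & (Abs - 1)) == 0 && "expected +/-2^k, k > 1");
  unsigned Lg2 = 0;
  while ((1u << Lg2) < Abs)
    ++Lg2;
  int32_t Biased = X < 0 ? X + int32_t(Abs - 1) : X; // the SETLT + ADD + CMOV
  int32_t Q = Biased >> Lg2;                         // the SRA
  return NegDivisor ? -Q : Q;                        // the final SUB 0, Q
}

int main() {
  // Matches C's truncating division: -9/8 == -1, 9/8 == 1, -9/-8 == 1.
  std::printf("%d %d %d\n", sdivPow2(-9, 8), sdivPow2(9, 8), sdivPow2(-9, -8));
}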
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
@@ -19842,8 +20434,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
- X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
- dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
+ dl, MVT::i8);
return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}
@@ -19935,13 +20527,6 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- // If this is a seteq make sure any build vectors of all zeros are on the RHS.
- // This helps with vptestm matching.
- // TODO: Should we just canonicalize the setcc during DAG combine?
- if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
- ISD::isBuildVectorAllZeros(Op0.getNode()))
- std::swap(Op0, Op1);
-
// Prefer SETGT over SETLT.
if (SetCCOpcode == ISD::SETLT) {
SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
@@ -20007,7 +20592,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// Only do this pre-AVX since vpcmp* is no longer destructive.
if (Subtarget.hasAVX())
return SDValue();
- SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
+ SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
if (!ULEOp1)
return SDValue();
Op1 = ULEOp1;
@@ -20018,7 +20603,7 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
// This is beneficial because materializing a constant 0 for the PCMPEQ is
// probably cheaper than XOR+PCMPGT using 2 different vector constants:
// cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
- SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+ SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
if (!UGEOp1)
return SDValue();
Op1 = Op0;
@@ -20086,14 +20671,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC0, dl, MVT::i8));
+ DAG.getTargetConstant(CC0, dl, MVT::i8));
SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CC1, dl, MVT::i8));
+ DAG.getTargetConstant(CC1, dl, MVT::i8));
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(SSECC, dl, MVT::i8));
+ DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
@@ -20106,16 +20691,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
MVT VTOp0 = Op0.getSimpleValueType();
+ (void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
"Expected operands with same type!");
assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
"Invalid number of packed elements for source and destination!");
- // This is being called by type legalization because v2i32 is marked custom
- // for result type legalization for v2f32.
- if (VTOp0 == MVT::v2i32)
- return SDValue();
-
// The non-AVX512 code below works under the assumption that source and
// destination types are the same.
assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
@@ -20153,7 +20734,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getConstant(CmpMode, dl, MVT::i8));
+ DAG.getTargetConstant(CmpMode, dl, MVT::i8));
}
// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
@@ -20222,21 +20803,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
TLI.isOperationLegal(ISD::UMIN, VT)) {
// If we have a constant operand, increment/decrement it and change the
// condition to avoid an invert.
- if (Cond == ISD::SETUGT &&
- ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
- return !C->getAPIntValue().isMaxValue();
- })) {
+ if (Cond == ISD::SETUGT) {
// X > C --> X >= (C+1) --> X == umax(X, C+1)
- Op1 = DAG.getNode(ISD::ADD, dl, VT, Op1, DAG.getConstant(1, dl, VT));
- Cond = ISD::SETUGE;
+ if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
+ Op1 = UGTOp1;
+ Cond = ISD::SETUGE;
+ }
}
- if (Cond == ISD::SETULT &&
- ISD::matchUnaryPredicate(Op1, [](ConstantSDNode *C) {
- return !C->getAPIntValue().isNullValue();
- })) {
+ if (Cond == ISD::SETULT) {
// X < C --> X <= (C-1) --> X == umin(X, C-1)
- Op1 = DAG.getNode(ISD::SUB, dl, VT, Op1, DAG.getConstant(1, dl, VT));
- Cond = ISD::SETULE;
+ if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
+ Op1 = ULTOp1;
+ Cond = ISD::SETULE;
+ }
}
bool Invert = false;
unsigned Opc;
@@ -20360,11 +20939,11 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
return Result;
}
-// Try to select this as a KORTEST+SETCC if possible.
-static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue &X86CC) {
+// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
+static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue &X86CC) {
// Only support equality comparisons.
if (CC != ISD::SETEQ && CC != ISD::SETNE)
return SDValue();
@@ -20389,6 +20968,21 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
} else
return SDValue();
+ // If the input is an AND, we can combine its operands into the KTEST.
+ bool KTestable = false;
+ if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
+ KTestable = true;
+ if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
+ KTestable = true;
+ if (!isNullConstant(Op1))
+ KTestable = false;
+ if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
+ SDValue LHS = Op0.getOperand(0);
+ SDValue RHS = Op0.getOperand(1);
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
+ }
+
// If the input is an OR, we can combine its operands into the KORTEST.
SDValue LHS = Op0;
SDValue RHS = Op0;
@@ -20397,7 +20991,7 @@ static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
RHS = Op0.getOperand(1);
}
- X86CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
}
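For reference, both instructions set ZF from a pair of mask registers: KORTEST from their OR and KTEST from their AND, which is why an equality compare of (and k1, k2) with zero can feed KTEST directly instead of first materializing the KAND. A trivial standalone model (mask registers as integers, hypothetical names):

#include <cstdint>
#include <cstdio>

// ZF behaviour relied on above: KORTEST -> ZF = ((A | B) == 0),
// KTEST -> ZF = ((A & B) == 0).
static bool kortestZF(uint16_t A, uint16_t B) { return uint16_t(A | B) == 0; }
static bool ktestZF(uint16_t A, uint16_t B) { return uint16_t(A & B) == 0; }

int main() {
  uint16_t K1 = 0x00F0, K2 = 0x0F00;
  // setcc (and K1, K2), 0, eq  ==>  SETE of the ZF produced by KTEST K1, K2.
  std::printf("ktest ZF=%d, kortest ZF=%d\n", int(ktestZF(K1, K2)),
              int(kortestZF(K1, K2))); // ktest ZF=1, kortest ZF=0
}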
@@ -20425,9 +21019,9 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return PTEST;
}
- // Try to lower using KORTEST.
- if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
- return KORTEST;
+ // Try to lower using KORTEST or KTEST.
+ if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
+ return Test;
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
// these.
@@ -20442,7 +21036,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
if (Invert) {
X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- X86CC = DAG.getConstant(CCode, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
}
return Op0.getOperand(1);
@@ -20456,7 +21050,7 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- X86CC = DAG.getConstant(CondCode, dl, MVT::i8);
+ X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
@@ -20472,6 +21066,19 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // Handle f128 first, since one possible outcome is a normal integer
+ // comparison which gets handled by emitFlagsForSetcc.
+ if (Op0.getValueType() == MVT::f128) {
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+
+ // If softenSetCCOperands returned a scalar, use it.
+ if (!Op1.getNode()) {
+ assert(Op0.getValueType() == Op.getValueType() &&
+ "Unexpected setcc expansion!");
+ return Op0;
+ }
+ }
+
SDValue X86CC;
SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
if (!EFLAGS)
@@ -20612,15 +21219,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
- CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
assert(!VT.isVector() && "Not a scalar type?");
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
- DAG.getConstant(SSECC, DL, MVT::i8));
+ DAG.getTargetConstant(SSECC, DL, MVT::i8));
// If we have AVX, we can use a variable vector select (VBLENDV) instead
// of 3 logic instructions for size savings and potentially speed.
@@ -20718,8 +21326,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
- unsigned CondCode =
- cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode = Cond.getConstantOperandVal(0);
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
@@ -20807,8 +21414,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
- MVT VT = Op.getSimpleValueType();
-
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
!isScalarFPTypeInSSEReg(VT)) // FPStack?
@@ -20826,7 +21431,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode X86Cond;
std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- CC = DAG.getConstant(X86Cond, DL, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
AddTest = false;
}
@@ -20848,7 +21453,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
if (AddTest) {
- CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
X86::COND_NE, DL, DAG);
}
@@ -20864,9 +21469,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
(isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(isNullConstant(Op1) || isNullConstant(Op2))) {
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- Cond);
+ SDValue Res =
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
return DAG.getNOT(DL, Res, Res.getValueType());
return Res;
@@ -21037,8 +21642,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
// pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
if (Subtarget.hasAVX()) {
assert(VT.is256BitVector() && "256-bit vector expected");
- int HalfNumElts = NumElts / 2;
- MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts);
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ int HalfNumElts = HalfVT.getVectorNumElements();
unsigned NumSrcElts = InVT.getVectorNumElements();
SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
@@ -21081,7 +21686,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
- DAG.getConstant(SignExtShift, dl, MVT::i8));
+ DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
}
if (VT == MVT::v2i64) {
@@ -21119,7 +21724,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
// Custom legalize v8i8->v8i64 on CPUs without avx512bw.
if (InVT == MVT::v8i8) {
- if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ if (VT != MVT::v8i64)
return SDValue();
In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
@@ -21138,10 +21743,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
// for v4i32 the high shuffle mask will be {2, 3, -1, -1}
// use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
// concat the vectors to original VT
-
- MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
-
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
@@ -21165,7 +21767,7 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
- if (Store->isVolatile())
+ if (!Store->isSimple())
return SDValue();
MVT StoreVT = StoredVal.getSimpleValueType();
@@ -21201,7 +21803,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
// Splitting volatile memory ops is not allowed unless the operation was not
// legal to begin with. We are assuming the input op is legal (this transform
// is only used for targets with AVX).
- if (Store->isVolatile())
+ if (!Store->isSimple())
return SDValue();
MVT StoreSVT = StoreVT.getScalarType();
@@ -21266,14 +21868,13 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
"Unexpected VT");
- if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) !=
- TargetLowering::TypeWidenVector)
- return SDValue();
+ assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
+ TargetLowering::TypeWidenVector && "Unexpected type action!");
- MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
- StoreVT.getVectorNumElements() * 2);
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
@@ -21313,11 +21914,10 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
- EVT MemVT = Ld->getMemoryVT();
// Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
if (RegVT.getVectorElementType() == MVT::i1) {
- assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+ assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
"Expected AVX512F without AVX512DQI");
@@ -21336,176 +21936,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
}
- // Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned RegSz = RegVT.getSizeInBits();
-
- ISD::LoadExtType Ext = Ld->getExtensionType();
-
- assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
- && "Only anyext and sext are currently implemented.");
- assert(MemVT != RegVT && "Cannot extend to the same type");
- assert(MemVT.isVector() && "Must load a vector from memory");
-
- unsigned NumElems = RegVT.getVectorNumElements();
- unsigned MemSz = MemVT.getSizeInBits();
- assert(RegSz > MemSz && "Register size must be greater than the mem size");
-
- if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
- // The only way in which we have a legal 256-bit vector result but not the
- // integer 256-bit operations needed to directly lower a sextload is if we
- // have AVX1 but not AVX2. In that case, we can always emit a sextload to
- // a 128-bit vector and a normal sign_extend to 256-bits that should get
- // correctly legalized. We do this late to allow the canonical form of
- // sextload to persist throughout the rest of the DAG combiner -- it wants
- // to fold together any extensions it can, and so will fuse a sign_extend
- // of an sextload into a sextload targeting a wider value.
- SDValue Load;
- if (MemSz == 128) {
- // Just switch this to a normal load.
- assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
- "it must be a legal 128-bit vector "
- "type!");
- Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- } else {
- assert(MemSz < 128 &&
- "Can't extend a type wider than 128 bits to a 256 bit vector!");
- // Do an sext load to a 128-bit vector type. We want to use the same
- // number of elements, but elements half as wide. This will end up being
- // recursively lowered by this routine, but will succeed as we definitely
- // have all the necessary features if we're using AVX1.
- EVT HalfEltVT =
- EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
- EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
- Load =
- DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- }
-
- // Replace chain users with the new chain.
- assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
-
- // Finally, do a normal sign-extend to the desired register.
- SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT);
- return DAG.getMergeValues({SExt, Load.getValue(1)}, dl);
- }
-
- // All sizes must be a power of two.
- assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
- "Non-power-of-two elements are not custom lowered!");
-
- // Attempt to load the original value using scalar loads.
- // Find the largest scalar type that divides the total loaded size.
- MVT SclrLoadTy = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
- SclrLoadTy = Tp;
- }
- }
-
- // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
- if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
- (64 <= MemSz))
- SclrLoadTy = MVT::f64;
-
- // Calculate the number of scalar loads that we need to perform
- // in order to load our vector from memory.
- unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
-
- assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
- "Can only lower sext loads with a single scalar load!");
-
- unsigned loadRegSize = RegSz;
- if (Ext == ISD::SEXTLOAD && RegSz >= 256)
- loadRegSize = 128;
-
- // If we don't have BWI we won't be able to create the shuffle needed for
- // v8i8->v8i64.
- if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
- MemVT == MVT::v8i8)
- loadRegSize = 128;
-
- // Represent our vector as a sequence of elements which are the
- // largest scalar that we can load.
- EVT LoadUnitVecVT = EVT::getVectorVT(
- *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
-
- // Represent the data using the same element type that is stored in
- // memory. In practice, we ''widen'' MemVT.
- EVT WideVecVT =
- EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- loadRegSize / MemVT.getScalarSizeInBits());
-
- assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
- "Invalid vector type");
-
- // We can't shuffle using an illegal type.
- assert(TLI.isTypeLegal(WideVecVT) &&
- "We only lower types that form legal widened vector types");
-
- SmallVector<SDValue, 8> Chains;
- SDValue Ptr = Ld->getBasePtr();
- unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
- SDValue Increment = DAG.getConstant(OffsetInc, dl,
- TLI.getPointerTy(DAG.getDataLayout()));
- SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
-
- unsigned Offset = 0;
- for (unsigned i = 0; i < NumLoads; ++i) {
- unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);
-
- // Perform a single load.
- SDValue ScalarLoad =
- DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
- Ld->getPointerInfo().getWithOffset(Offset),
- NewAlign, Ld->getMemOperand()->getFlags());
- Chains.push_back(ScalarLoad.getValue(1));
- // Create the first element type using SCALAR_TO_VECTOR in order to avoid
- // another round of DAGCombining.
- if (i == 0)
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
- else
- Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
- ScalarLoad, DAG.getIntPtrConstant(i, dl));
-
- Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
- Offset += OffsetInc;
- }
-
- SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
-
- // Bitcast the loaded value to a vector of the original element type, in
- // the size of the target vector type.
- SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
- unsigned SizeRatio = RegSz / MemSz;
-
- if (Ext == ISD::SEXTLOAD) {
- SDValue Sext = getExtendInVec(ISD::SIGN_EXTEND, dl, RegVT, SlicedVec, DAG);
- return DAG.getMergeValues({Sext, TF}, dl);
- }
-
- if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
- MemVT == MVT::v8i8) {
- SDValue Sext = getExtendInVec(ISD::ZERO_EXTEND, dl, RegVT, SlicedVec, DAG);
- return DAG.getMergeValues({Sext, TF}, dl);
- }
-
- // Redistribute the loaded elements into the different locations.
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio] = i;
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
-
- // Bitcast to the requested type.
- Shuff = DAG.getBitcast(RegVT, Shuff);
- return DAG.getMergeValues({Shuff, TF}, dl);
+ return SDValue();
}
/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
@@ -21610,7 +22041,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (Inverted)
X86Cond = X86::GetOppositeBranchCondition(X86Cond);
- CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
addTest = false;
} else {
unsigned CondOpc;
@@ -21638,10 +22069,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (Cmp == Cond.getOperand(1).getOperand(1) &&
isX86LogicalCmp(Cmp) &&
Op.getNode()->hasOneUse()) {
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ X86::CondCode CCode0 =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode0 = X86::GetOppositeBranchCondition(CCode0);
+ CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
SDNode *User = *Op.getNode()->use_begin();
// Look for an unconditional branch following this conditional branch.
// We need this because we need to reverse the successors in order
@@ -21654,12 +22085,12 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
(void)NewBR;
Dest = FalseBB;
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
+ Dest, CC, Cmp);
+ X86::CondCode CCode1 =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode1 = X86::GetOppositeBranchCondition(CCode1);
+ CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21672,7 +22103,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
X86::CondCode CCode =
(X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getConstant(CCode, dl, MVT::i8);
+ CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
Cond = Cond.getOperand(0).getOperand(1);
addTest = false;
} else if (Cond.getOpcode() == ISD::SETCC &&
@@ -21698,10 +22129,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21714,10 +22145,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
Cond.getOperand(0), Cond.getOperand(1));
Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
Chain, Dest, CC, Cmp);
- CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
Cond = Cmp;
addTest = false;
}
@@ -21742,7 +22173,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getConstant(X86Cond, dl, MVT::i8);
+ CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
X86Cond, dl, DAG);
}
@@ -21770,7 +22201,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Align = Op.getConstantOperandVal(2);
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
@@ -21811,7 +22242,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
}
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
- unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
@@ -21821,7 +22252,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned SPReg = RegInfo->getStackRegister();
+ Register SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
@@ -22076,7 +22507,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
}
return DAG.getNode(Opc, dl, VT, SrcOp,
- DAG.getConstant(ShiftAmt, dl, MVT::i8));
+ DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
}
/// Handle vector element shifts where the shift amount may or may not be a
@@ -22121,7 +22552,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
MVT::v2i64, ShAmt);
else {
- SDValue ByteShift = DAG.getConstant(
+ SDValue ByteShift = DAG.getTargetConstant(
(128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
@@ -22308,13 +22739,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
- return C->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
return false;
};
auto isRoundModeSAE = [](SDValue Rnd) {
- if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
- return C->getZExtValue() == X86::STATIC_ROUNDING::NO_EXC;
+ if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
+ unsigned RC = C->getZExtValue();
+ if (RC & X86::STATIC_ROUNDING::NO_EXC) {
+ // Clear the NO_EXC bit and check remaining bits.
+ RC ^= X86::STATIC_ROUNDING::NO_EXC;
+ // As a convenience we allow either no other bits to be set, or only the
+ // CUR_DIRECTION bit to be set explicitly.
+ return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
+ }
+ }
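+ // (Illustrative: with NO_EXC == 8 and CUR_DIRECTION == 4 in
+ // X86::STATIC_ROUNDING, the accepted raw immediates are 8 and 12; any
+ // explicit rounding mode without NO_EXC is rejected below.)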
return false;
};
@@ -22335,7 +22774,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
};
SDLoc dl(Op);
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
@@ -22411,9 +22850,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
- if (IntrData->Type == INTR_TYPE_3OP_IMM8)
- Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
-
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -22666,7 +23102,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_CC: {
MVT MaskVT = Op.getSimpleValueType();
SDValue CC = Op.getOperand(3);
- CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
@@ -22685,7 +23120,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
- SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue CC = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
SDValue Cmp;
@@ -22750,16 +23185,16 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case COMI_RM: { // Comparison intrinsics with Sae
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ unsigned CondVal = Op.getConstantOperandVal(3);
SDValue Sae = Op.getOperand(4);
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ DAG.getTargetConstant(CondVal, dl, MVT::i8));
else if (isRoundModeSAE(Sae))
FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
else
return SDValue();
// Need to fill with zeros to ensure the bitcast will produce zeroes
@@ -22819,9 +23254,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
- Op.getOperand(2),
- DAG.getConstant(0xf, dl, MVT::i32));
+ auto Round = cast<ConstantSDNode>(Op.getOperand(2));
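+ // Bits [7:4] of the VRNDSCALE immediate select the fixed-point scale, so
+ // masking with 0xf keeps only the legacy rounding-control bits.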
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), RoundingMode);
}
@@ -22829,12 +23264,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
// Clear the upper bits of the rounding immediate so that the legacy
// intrinsic can't trigger the scaling behavior of VRNDSCALE.
- SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
- Op.getOperand(3),
- DAG.getConstant(0xf, dl, MVT::i32));
+ auto Round = cast<ConstantSDNode>(Op.getOperand(3));
+ SDValue RoundingMode =
+ DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
+ case BEXTRI: {
+ assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
+
+ // The control is a TargetConstant, but we need to convert it to a
+ // ConstantSDNode.
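+ // (The control immediate packs the starting bit index in bits [7:0] and
+ // the extract length in bits [15:8].)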
+ uint64_t Imm = Op.getConstantOperandVal(2);
+ SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Control);
+ }
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
@@ -23165,6 +23610,61 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
MaskVT, Operation);
return DAG.getMergeValues({Result0, Result1}, DL);
}
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDLoc DL(Op);
+ SDValue ShAmt = Op.getOperand(2);
+ // If the argument is a constant, convert it to a target constant.
+ if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
+ ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1), ShAmt);
+ }
+
+ unsigned NewIntrinsic;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+ break;
+ }
+
+ // The vector shift intrinsics with scalar shift amounts use 32-bit
+ // amounts, but the SSE2/MMX shift instructions read 64 bits. Copy the
+ // 32 bits to an MMX register.
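+ // Illustrative: a non-constant @llvm.x86.mmx.pslli.w amount becomes
+ // @llvm.x86.mmx.psll.w with the i32 amount moved into an MMX register
+ // via MMX_MOVW2D.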
+ ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+ DAG.getConstant(NewIntrinsic, DL, MVT::i32),
+ Op.getOperand(1), ShAmt);
+
+ }
}
}
@@ -23177,7 +23677,9 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
@@ -23204,7 +23706,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
VT.getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -23238,7 +23742,9 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
Src.getSimpleValueType().getVectorNumElements());
MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
@@ -23266,7 +23772,9 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
// Scale must be constant.
if (!C)
return SDValue();
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
@@ -23435,8 +23943,7 @@ EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-
+ unsigned IntNo = Op.getConstantOperandVal(1);
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
@@ -23538,10 +24045,10 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
- SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
- DAG.getConstant(1, dl, Op->getValueType(1)),
- DAG.getConstant(X86::COND_B, dl, MVT::i8),
- SDValue(Result.getNode(), 1) };
+ SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, dl, Op->getValueType(1)),
+ DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
+ SDValue(Result.getNode(), 1)};
SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
@@ -23581,8 +24088,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
Scale, Chain, Subtarget);
}
case PREFETCH: {
- SDValue Hint = Op.getOperand(6);
- unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
+ const APInt &HintVal = Op.getConstantOperandAPInt(6);
assert((HintVal == 2 || HintVal == 3) &&
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
@@ -23678,7 +24184,7 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -23730,7 +24236,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
unsigned FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
- unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned Depth = Op.getConstantOperandVal(0);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -23743,12 +24249,11 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
+Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const MachineFunction &MF = DAG.getMachineFunction();
- unsigned Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<unsigned>(RegName)
.Case("esp", X86::ESP)
.Case("rsp", X86::RSP)
.Case("ebp", X86::EBP)
@@ -23762,8 +24267,7 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
#ifndef NDEBUG
else {
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FrameReg =
- RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
+ Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
"Invalid Frame Register!");
}
@@ -23809,7 +24313,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
+ Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
@@ -23967,6 +24471,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
case CallingConv::Fast:
+ case CallingConv::Tail:
// Pass 'nest' parameter in EAX.
// Must be kept in sync with X86CallingConv.td
NestReg = X86::EAX;
@@ -24279,12 +24784,9 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
@@ -24312,12 +24814,9 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
// If src is zero (i.e. bsf sets ZF), returns NumBits.
- SDValue Ops[] = {
- Op,
- DAG.getConstant(NumBits, dl, VT),
- DAG.getConstant(X86::COND_E, dl, MVT::i8),
- Op.getValue(1)
- };
+ SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
+ DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
+ Op.getValue(1)};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
@@ -24453,7 +24952,7 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
SDValue N0 = Op.getOperand(0);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
DAG.getConstant(0, DL, VT), N0);
- SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
SDValue(Neg.getNode(), 1)};
return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
}
@@ -25033,7 +25532,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
// Optimize shl/srl/sra with constant shift amount.
APInt APIntShiftAmt;
- if (!isConstantSplat(Amt, APIntShiftAmt))
+ if (!X86::isConstantSplat(Amt, APIntShiftAmt))
return SDValue();
// If the shift amount is out of range, return undef.
@@ -25220,7 +25719,7 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
}
ConstantSDNode *ND = cast<ConstantSDNode>(Op);
- APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
+ APInt C(SVTBits, ND->getZExtValue());
uint64_t ShAmt = C.getZExtValue();
if (ShAmt >= SVTBits) {
Elts.push_back(DAG.getUNDEF(SVT));
@@ -25502,7 +26001,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
(VT == MVT::v32i8 && Subtarget.hasInt256())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
- SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8);
+ SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
// Extend constant shift amount to vXi16 (it doesn't matter if the type
// isn't legal).
@@ -25774,7 +26273,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(Op, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
@@ -25795,7 +26294,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (0 <= CstSplatIndex) {
uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
@@ -26032,7 +26531,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// If this is a canonical idempotent atomicrmw w/no uses, we have a better
// lowering available in lowerAtomicArith.
- // TODO: push more cases through this path.
+ // TODO: push more cases through this path.
if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
AI->use_empty())
@@ -26087,10 +26586,22 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
return Loaded;
}
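+// Unordered atomic loads and stores may be selected as ordinary load/store
+// SDNodes, but only when the ExperimentalUnorderedISEL flag is enabled.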
+bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
+ if (!SI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
+ if (!LI.isUnordered())
+ return false;
+ return ExperimentalUnorderedISEL;
+}
+
+
/// Emit a locked operation on a stack location which does not change any
/// memory location, but does involve a lock prefix. Location is chosen to be
/// a) very likely accessed only by a single thread to minimize cache traffic,
-/// and b) definitely dereferenceable. Returns the new Chain result.
+/// and b) definitely dereferenceable. Returns the new Chain result.
static SDValue emitLockedStackOp(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
SDValue Chain, SDLoc DL) {
@@ -26099,22 +26610,22 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
// operations issued by the current processor. As such, the location
// referenced is not relevant for the ordering properties of the instruction.
 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
- // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
+ // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
// 2) Using an immediate operand appears to be the best encoding choice
// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
// 4) When choosing offsets, there are several contributing factors:
// a) If there's no redzone, we default to TOS. (We could allocate a cache
- // line aligned stack object to improve this case.)
+ // line aligned stack object to improve this case.)
// b) To minimize our chances of introducing a false dependence, we prefer
- // to offset the stack usage from TOS slightly.
+ // to offset the stack usage from TOS slightly.
// c) To minimize concerns about cross thread stack usage - in particular,
// the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
// captures state in the TOS frame and accesses it from many threads -
// we want to use an offset such that the offset is in a distinct cache
// line from the TOS frame.
- //
+ //
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
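 // Sketch of the result: with a red zone available this is roughly
 //   lock orl $0, -64(%rsp)
 // and without one it targets the stack pointer directly, e.g.
 //   lock orl $0, (%esp)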
@@ -26155,10 +26666,10 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
- AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
- cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
- cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ AtomicOrdering FenceOrdering =
+ static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
+ SyncScope::ID FenceSSID =
+ static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
@@ -26167,7 +26678,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
- SDValue Chain = Op.getOperand(0);
+ SDValue Chain = Op.getOperand(0);
return emitLockedStackOp(DAG, Subtarget, Chain, dl);
}
@@ -26218,6 +26729,17 @@ static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT InVT = V.getSimpleValueType();
+ if (InVT == MVT::v64i8) {
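+ // Each v32i8 half yields a 32-bit mask; recombine as (Hi << 32) | Lo to
+ // form the full 64-bit MOVMSK result.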
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
+ Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
+ Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
+ DAG.getConstant(32, DL, MVT::i8));
+ return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
+ }
if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
@@ -26258,8 +26780,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
- DstVT.getVectorNumElements() / 2);
+ MVT CastVT = DstVT.getHalfNumVectorElementsVT();
Lo = DAG.getBitcast(CastVT, Lo);
Hi = DAG.getBitcast(CastVT, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
@@ -26275,53 +26796,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getZExtOrTrunc(V, DL, DstVT);
}
- if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
- SrcVT == MVT::i64) {
- assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- if (DstVT != MVT::f64 && DstVT != MVT::i64 &&
- !(DstVT == MVT::x86mmx && SrcVT.isVector()))
- // This conversion needs to be expanded.
- return SDValue();
+ assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
+ SrcVT == MVT::i64) && "Unexpected VT!");
- SDLoc dl(Op);
- if (SrcVT.isVector()) {
- // Widen the vector in input in the case of MVT::v2i32.
- // Example: from MVT::v2i32 to MVT::v4i32.
- MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
- SrcVT.getVectorNumElements() * 2);
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
- DAG.getUNDEF(SrcVT));
- } else {
- assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
- "Unexpected source type in LowerBITCAST");
- Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
- }
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
+ !(DstVT == MVT::x86mmx && SrcVT.isVector()))
+ // This conversion needs to be expanded.
+ return SDValue();
- MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
- Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
+ SDLoc dl(Op);
+ if (SrcVT.isVector()) {
+ // Widen the vector in input in the case of MVT::v2i32.
+ // Example: from MVT::v2i32 to MVT::v4i32.
+ MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
+ SrcVT.getVectorNumElements() * 2);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
+ DAG.getUNDEF(SrcVT));
+ } else {
+ assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
+ "Unexpected source type in LowerBITCAST");
+ Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
+ }
- if (DstVT == MVT::x86mmx)
- return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
+ MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
+ Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
- DAG.getIntPtrConstant(0, dl));
- }
+ if (DstVT == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
- assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
- Subtarget.hasMMX() && "Unexpected custom BITCAST");
- assert((DstVT == MVT::i64 ||
- (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
- "Unexpected custom BITCAST");
- // i64 <=> MMX conversions are Legal.
- if (SrcVT==MVT::i64 && DstVT.isVector())
- return Op;
- if (DstVT==MVT::i64 && SrcVT.isVector())
- return Op;
- // MMX <=> MMX conversions are Legal.
- if (SrcVT.isVector() && DstVT.isVector())
- return Op;
- // All other conversions need to be expanded.
- return SDValue();
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
+ DAG.getIntPtrConstant(0, dl));
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
@@ -26549,6 +27054,13 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
+ // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
+ // lowering.
+ if (VT == MVT::v8i64 || VT == MVT::v16i32) {
+ assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
+ return Lower512IntUnary(Op, DAG);
+ }
+
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
"Only byte vector BITREVERSE supported");
@@ -26656,12 +27168,12 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
// seq_cst which isn't SingleThread, everything just needs to be preserved
// during codegen and then dropped. Note that we expect (but don't assume),
// that orderings other than seq_cst and acq_rel have been canonicalized to
- // a store or load.
+ // a store or load.
if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
AN->getSyncScopeID() == SyncScope::System) {
// Prefer a locked operation against a stack location to minimize cache
// traffic. This assumes that stack locations are very likely to be
- // accessed only by the owning thread.
+ // accessed only by the owning thread.
SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
@@ -26886,12 +27398,13 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Chain = N->getChain();
SDValue BasePtr = N->getBasePtr();
- if (VT == MVT::v2f32) {
+ if (VT == MVT::v2f32 || VT == MVT::v2i32) {
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
// If the index is v2i64 and we have VLX we can use xmm for data and index.
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32));
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
@@ -26901,30 +27414,6 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
return SDValue();
}
- if (VT == MVT::v2i32) {
- assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
- DAG.getUNDEF(MVT::v2i32));
- // If the index is v2i64 and we have VLX we can use xmm for data and index.
- if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
- SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
- }
- // Custom widen all the operands to avoid promotion.
- EVT NewIndexVT = EVT::getVectorVT(
- *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
- Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
- DAG.getUNDEF(Index.getValueType()));
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getConstant(0, dl, MVT::v2i1));
- SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
- Ops, N->getMemOperand());
- }
-
MVT IndexVT = Index.getSimpleValueType();
MVT MaskVT = Mask.getSimpleValueType();
@@ -27160,6 +27649,13 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
return NOOP;
}
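+/// Lower an f128 arithmetic operation as a soft-float libcall (e.g.
+/// RTLIB::MUL_F128 is expected to map to __multf3).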
+SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const {
+ SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ MakeLibCallOptions CallOptions;
+ return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -27206,10 +27702,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
- case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget);
+ case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -27347,37 +27847,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::MUL: {
EVT VT = N->getValueType(0);
- assert(VT.isVector() && "Unexpected VT");
- if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
- VT.getVectorNumElements() == 2) {
- // Promote to a pattern that will be turned into PMULUDQ.
- SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
- N->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
- N->getOperand(1));
- SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
- } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
- VT.getVectorElementType() == MVT::i8) {
- // Pre-promote these to vXi16 to avoid op legalization thinking all 16
- // elements are needed.
- MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
- SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
- SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
- Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
- unsigned NumConcats = 16 / VT.getVectorNumElements();
- SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
- ConcatOps[0] = Res;
- Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
- Results.push_back(Res);
- }
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
+ // Pre-promote these to vXi16 to avoid op legalization thinking all 16
+ // elements are needed.
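+ // Illustrative: a v2i8 multiply becomes any_extend to v2i16, a v2i16
+ // multiply, a truncate back to v2i8, and a concat with undef v2i8 pieces
+ // to form the widened v16i8 result.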
+ MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
+ SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ unsigned NumConcats = 16 / VT.getVectorNumElements();
+ SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
+ ConcatOps[0] = Res;
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
+ Results.push_back(Res);
return;
}
- case ISD::UADDSAT:
- case ISD::SADDSAT:
- case ISD::USUBSAT:
- case ISD::SSUBSAT:
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
// Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
@@ -27388,6 +27873,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT InVT = N->getOperand(0).getValueType();
assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
"Expected a VT that divides into 128 bits.");
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
@@ -27404,9 +27891,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
- DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
@@ -27435,26 +27919,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Hi);
return;
}
- case ISD::SETCC: {
- // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
- // setCC result type is v2i1 because type legalzation will end up with
- // a v4i1 setcc plus an extend.
- assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
- if (N->getOperand(0).getValueType() != MVT::v2f32 ||
- getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector)
- return;
- SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
- SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
- N->getOperand(0), UNDEF);
- SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
- N->getOperand(1), UNDEF);
- SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
- N->getOperand(2));
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- return;
- }
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
case X86ISD::FMINC:
case X86ISD::FMIN:
@@ -27475,7 +27939,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SREM:
case ISD::UREM: {
EVT VT = N->getValueType(0);
- if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) {
+ if (VT.isVector()) {
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
// If this RHS is a constant splat vector we can widen this and let
// division/remainder by constant optimize it.
// TODO: Can we do something for non-splat?
@@ -27493,17 +27959,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if (VT == MVT::v2i32) {
- // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the
- // v2i64 and unroll later. But then we create i64 scalar ops which
- // might be slow in 64-bit mode or require a libcall in 32-bit mode.
- Results.push_back(DAG.UnrollVectorOp(N));
- return;
- }
-
- if (VT.isVector())
- return;
-
LLVM_FALLTHROUGH;
}
case ISD::SDIVREM:
@@ -27561,58 +28016,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
}
- return;
- }
- case ISD::SIGN_EXTEND_VECTOR_INREG: {
- if (ExperimentalVectorWideningLegalization)
- return;
-
- EVT VT = N->getValueType(0);
- SDValue In = N->getOperand(0);
- EVT InVT = In.getValueType();
- if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v16i16 || InVT == MVT::v32i8)) {
- // Custom split this so we can extend i8/i16->i32 invec. This is better
- // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
- // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
- // we allow the sra from the extend to i32 to be shared by the split.
- EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(),
- InVT.getVectorNumElements() / 2);
- MVT ExtendVT = MVT::getVectorVT(MVT::i32,
- VT.getVectorNumElements());
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT,
- In, DAG.getIntPtrConstant(0, dl));
- In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In);
-
- // Fill a vector with sign bits for each element.
- SDValue Zero = DAG.getConstant(0, dl, ExtendVT);
- SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
-
- // Create an unpackl and unpackh to interleave the sign bits then bitcast
- // to vXi64.
- SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits);
- Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
- SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits);
- Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
+ getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
+ isTypeLegal(MVT::v4i64)) {
+ // Input needs to be split and output needs to widened. Let's use two
+ // VTRUNCs, and shuffle their results together into the wider type.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
+ Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
+ SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
+ { 0, 1, 2, 3, 16, 17, 18, 19,
+ -1, -1, -1, -1, -1, -1, -1, -1 });
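+ // The mask gathers the four meaningful bytes of each truncated half into
+ // the low lanes and leaves the rest undef, producing the widened
+ // v8i8-in-v16i8 result.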
Results.push_back(Res);
return;
}
+
return;
}
+ case ISD::ANY_EXTEND:
+ // Right now, only MVT::v8i8 has a Custom action for an illegal type.
+ // The Custom action is intended to handle the illegal input type.
+ assert(N->getValueType(0) == MVT::v8i8 &&
+ "Do not know how to legalize this Node");
+ return;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8) &&
- getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector) {
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
+ assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
+ "Unexpected type action!");
assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
// Custom split this so we can extend i8/i16->i32 invec. This is better
// since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
@@ -27683,27 +28120,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Src = N->getOperand(0);
EVT SrcVT = Src.getValueType();
- // Promote these manually to avoid over promotion to v2i64. Type
- // legalization will revisit the v2i32 operation for more cleanup.
- if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
- getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
- // AVX512DQ provides instructions that produce a v2i64 result.
- if (Subtarget.hasDQI())
- return;
-
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
- Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
- : ISD::AssertSext,
- dl, MVT::v2i32, Res,
- DAG.getValueType(VT.getVectorElementType()));
- Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
- Results.push_back(Res);
- return;
- }
-
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
// Try to create a 128 bit vector, but don't exceed a 32 bit element.
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
@@ -27738,35 +28157,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert((IsSigned || Subtarget.hasAVX512()) &&
"Can only handle signed conversion without AVX512");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
- bool Widenv2i32 =
- getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
if (!IsSigned && !Subtarget.hasVLX()) {
- // If v2i32 is widened, we can defer to the generic legalizer.
- if (Widenv2i32)
- return;
- // Custom widen by doubling to a legal vector with. Isel will
- // further widen to v8f64.
- Opc = ISD::FP_TO_UINT;
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64,
- Src, DAG.getUNDEF(MVT::v2f64));
+ // If we have VLX we can emit a target specific FP_TO_UINT node,
+ // otherwise we can defer to the generic legalizer which will widen
+ // the input as well. This will be further widened during op
+ // legalization to v8i32<-v8f64.
+ return;
}
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
- if (!Widenv2i32)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- return;
- }
- if (SrcVT == MVT::v2f32 &&
- getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
- SDValue Idx = DAG.getIntPtrConstant(0, dl);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32));
- Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
- : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
@@ -27776,6 +28178,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
+ assert(!VT.isVector() && "Vectors should have been handled above!");
+
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
@@ -27847,7 +28251,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
default : llvm_unreachable("Do not know how to custom type "
"legalize this intrinsic operation!");
@@ -27905,7 +28309,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
SDValue Result;
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- unsigned BasePtr = TRI->getBaseRegister();
+ Register BasePtr = TRI->getBaseRegister();
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
(BasePtr == X86::RBX || BasePtr == X86::EBX)) {
@@ -28060,34 +28464,33 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- if (SrcVT != MVT::f64 ||
- (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
- getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
+ if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+ assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+ Results.push_back(Res);
return;
+ }
- unsigned NumElts = DstVT.getVectorNumElements();
- EVT SVT = DstVT.getVectorElementType();
- EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
- SDValue Res;
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
- Res = DAG.getBitcast(WiderVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
return;
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
- if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
Gather->getPassThru(),
- DAG.getUNDEF(MVT::v2f32));
+ DAG.getUNDEF(VT));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
@@ -28098,66 +28501,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
+ DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
return;
}
- if (VT == MVT::v2i32) {
- auto *Gather = cast<MaskedGatherSDNode>(N);
- SDValue Index = Gather->getIndex();
- SDValue Mask = Gather->getMask();
- assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
- SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
- Gather->getPassThru(),
- DAG.getUNDEF(MVT::v2i32));
- // If the index is v2i64 we can use it directly.
- if (Index.getValueType() == MVT::v2i64 &&
- (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
- if (!Subtarget.hasVLX()) {
- // We need to widen the mask, but the instruction will only use 2
- // of its elements. So we can use undef.
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getUNDEF(MVT::v2i1));
- Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
- }
- SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
- Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
- Gather->getMemoryVT(), Gather->getMemOperand());
- SDValue Chain = Res.getValue(2);
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- Results.push_back(Chain);
- return;
- }
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
- EVT IndexVT = Index.getValueType();
- EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
- IndexVT.getScalarType(), 4);
- // Otherwise we need to custom widen everything to avoid promotion.
- Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
- DAG.getUNDEF(IndexVT));
- Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
- DAG.getConstant(0, dl, MVT::v2i1));
- SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
- Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
- Gather->getMemoryVT(), dl, Ops,
- Gather->getMemOperand());
- SDValue Chain = Res.getValue(1);
- if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
- DAG.getIntPtrConstant(0, dl));
- Results.push_back(Res);
- Results.push_back(Chain);
- return;
- }
- }
return;
}
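
A minimal standalone sketch of the widening idea in the MGATHER hunk above: the 2-element gather is emulated as a 4-lane operation whose upper pass-through/mask lanes are padding that the consumer never reads. The gather4 helper is hypothetical, written only to illustrate the semantics; it is not LLVM API.

#include <array>
#include <cassert>

// Hypothetical model of the widened 4-lane masked gather.
static std::array<float, 4> gather4(const float *Base,
                                    const std::array<int, 4> &Index,
                                    const std::array<bool, 4> &Mask,
                                    const std::array<float, 4> &PassThru) {
  std::array<float, 4> Result = PassThru;
  for (int i = 0; i != 4; ++i)
    if (Mask[i])
      Result[i] = Base[Index[i]];
  return Result;
}

int main() {
  float Mem[8] = {0, 10, 20, 30, 40, 50, 60, 70};
  // Only lanes 0 and 1 carry the original 2-element gather; lanes 2 and 3 are
  // padding whose values are never consumed by the user of the result.
  std::array<int, 4> Index = {3, 5, 0, 0};
  std::array<bool, 4> Mask = {true, true, false, false};
  std::array<float, 4> PassThru = {-1.0f, -1.0f, 0.0f, 0.0f};
  std::array<float, 4> R = gather4(Mem, Index, Mask, PassThru);
  assert(R[0] == 30.0f && R[1] == 50.0f);
  return 0;
}
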
case ISD::LOAD: {
@@ -28166,8 +28515,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// cast since type legalization will try to use an i64 load.
MVT VT = N->getSimpleValueType(0);
assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
- if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
- return;
+ assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
+ "Unexpected type action!");
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
@@ -28177,11 +28526,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ MVT VecVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
+ EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
@@ -28236,6 +28584,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
case X86ISD::Wrapper: return "X86ISD::Wrapper";
case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
@@ -28373,6 +28722,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
@@ -28737,6 +29087,9 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
+ return false;
+
EVT SrcVT = ExtVal.getOperand(0).getValueType();
// There is no extending load for vXi1.
@@ -28856,10 +29209,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- unsigned mainDstReg = MRI.createVirtualRegister(RC);
- unsigned fallDstReg = MRI.createVirtualRegister(RC);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register fallDstReg = MRI.createVirtualRegister(RC);
// thisMBB:
// xbegin fallMBB
@@ -28913,7 +29266,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
static_assert(X86::AddrNumOperands == 5,
"VAARG_64 assumes 5 address operands");
- unsigned DestReg = MI.getOperand(0).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
MachineOperand &Base = MI.getOperand(1);
MachineOperand &Scale = MI.getOperand(2);
MachineOperand &Index = MI.getOperand(3);
@@ -29049,7 +29402,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
assert(OffsetReg != 0);
// Read the reg_save_area address.
- unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+ Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
.add(Base)
.add(Scale)
@@ -29059,8 +29412,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.setMemRefs(LoadOnlyMMO);
// Zero-extend the offset
- unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
- BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+ Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+ BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
.addImm(0)
.addReg(OffsetReg)
.addImm(X86::sub_32bit);
@@ -29071,7 +29424,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
.addReg(RegSaveReg);
// Compute the offset for the next argument
- unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+ Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
.addReg(OffsetReg)
.addImm(UseFPOffset ? 16 : 8);
@@ -29096,7 +29449,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
//
// Load the overflow_area address into a register.
- unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
.add(Base)
.add(Scale)
@@ -29110,7 +29463,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
if (NeedsAlign) {
// Align the overflow address
assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
- unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+ Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
@@ -29127,7 +29480,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Compute the next overflow address after this argument.
// (the overflow address should be kept 8-byte aligned)
- unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+ Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
.addReg(OverflowDestReg)
.addImm(ArgSizeA8);
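
A standalone sketch of the rounding arithmetic used for the va_arg overflow area above: (addr + (align - 1)) & ~(align - 1) rounds an address up to a power-of-two alignment. The alignUp helper is illustrative only, not an LLVM function.

#include <cassert>
#include <cstdint>

static uint64_t alignUp(uint64_t Addr, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "Alignment must be a power of 2");
  return (Addr + (Align - 1)) & ~(Align - 1);
}

int main() {
  assert(alignUp(0x1001, 16) == 0x1010); // rounds up to the next 16-byte boundary
  assert(alignUp(0x1010, 16) == 0x1010); // already-aligned addresses are unchanged
  return 0;
}
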
@@ -29191,7 +29544,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
- unsigned CountReg = MI.getOperand(0).getReg();
+ Register CountReg = MI.getOperand(0).getReg();
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
@@ -29273,7 +29626,9 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
static bool isCMOVPseudo(MachineInstr &MI) {
switch (MI.getOpcode()) {
case X86::CMOV_FR32:
+ case X86::CMOV_FR32X:
case X86::CMOV_FR64:
+ case X86::CMOV_FR64X:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
@@ -29326,9 +29681,9 @@ static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
MachineInstrBuilder MIB;
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
@@ -29486,9 +29841,9 @@ X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
// SinkMBB:
// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
- unsigned DestReg = FirstCMOV.getOperand(0).getReg();
- unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
- unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
+ Register DestReg = FirstCMOV.getOperand(0).getReg();
+ Register Op1Reg = FirstCMOV.getOperand(1).getReg();
+ Register Op2Reg = FirstCMOV.getOperand(2).getReg();
MachineInstrBuilder MIB =
BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
.addReg(Op1Reg)
@@ -30006,7 +30361,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
- unsigned CalleeVReg = MI.getOperand(0).getReg();
+ Register CalleeVReg = MI.getOperand(0).getReg();
unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
@@ -30079,7 +30434,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
// Initialize a register with zero.
MVT PVT = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
- unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
.addDef(ZReg)
@@ -30087,7 +30442,7 @@ void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
- unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
@@ -30131,8 +30486,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
(void)TRI;
- unsigned mainDstReg = MRI.createVirtualRegister(RC);
- unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+ Register mainDstReg = MRI.createVirtualRegister(RC);
+ Register restoreDstReg = MRI.createVirtualRegister(RC);
MemOpndSlot = CurOp;
@@ -30246,8 +30601,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
- unsigned FramePtr = RegInfo->getFrameRegister(*MF);
- unsigned BasePtr = RegInfo->getBaseRegister();
+ Register FramePtr = RegInfo->getFrameRegister(*MF);
+ Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
FramePtr, true, X86FI->getRestoreBasePointerOffset())
@@ -30329,7 +30684,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
- unsigned ZReg = MRI.createVirtualRegister(PtrRC);
+ Register ZReg = MRI.createVirtualRegister(PtrRC);
unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
.addDef(ZReg)
@@ -30337,7 +30692,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
.addReg(ZReg, RegState::Undef);
// Read the current SSP Register value to the zeroed register.
- unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
+ Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
@@ -30352,7 +30707,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
checkSspMBB->addSuccessor(fallMBB);
// Reload the previously saved SSP register value.
- unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
+ Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
const int64_t SPPOffset = 3 * PVT.getStoreSize();
MachineInstrBuilder MIB =
@@ -30370,7 +30725,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MIB.setMemRefs(MMOs);
// Subtract the current SSP from the previous SSP.
- unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
+ Register SspSubReg = MRI.createVirtualRegister(PtrRC);
unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
.addReg(PrevSSPReg)
@@ -30384,7 +30739,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
- unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
+ Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
.addReg(SspSubReg)
.addImm(Offset);
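
An illustrative sketch of the shift above: the shadow-stack delta is a byte count, but INCSSP advances the SSP in pointer-sized slots, so the delta is shifted right by 2 (32-bit) or 3 (64-bit). The sspSlotsToPop helper is hypothetical, used only to show the arithmetic.

#include <cassert>
#include <cstdint>

static uint64_t sspSlotsToPop(uint64_t ByteDelta, bool Is64Bit) {
  unsigned ShiftAmt = Is64Bit ? 3 : 2; // divide by the 8- or 4-byte slot size
  return ByteDelta >> ShiftAmt;
}

int main() {
  assert(sspSlotsToPop(64, /*Is64Bit=*/true) == 8);   // 64 bytes = 8 slots of 8 bytes
  assert(sspSlotsToPop(64, /*Is64Bit=*/false) == 16); // 64 bytes = 16 slots of 4 bytes
  return 0;
}
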
@@ -30394,7 +30749,7 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
// Reset the lower 8 bits.
- unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
+ Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
.addReg(SspFirstShrReg)
.addImm(8);
@@ -30406,12 +30761,12 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Do a single shift left.
unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
- unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
+ Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
.addReg(SspSecondShrReg);
// Save the value 128 to a register (will be used next with incssp).
- unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
+ Register Value128InReg = MRI.createVirtualRegister(PtrRC);
unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
.addImm(128);
@@ -30419,8 +30774,8 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
// Since incssp only looks at the lower 8 bits, we might need to do several
// iterations of incssp until we finish fixing the shadow stack.
- unsigned DecReg = MRI.createVirtualRegister(PtrRC);
- unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
+ Register DecReg = MRI.createVirtualRegister(PtrRC);
+ Register CounterReg = MRI.createVirtualRegister(PtrRC);
BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
.addReg(SspAfterShlReg)
.addMBB(fixShadowLoopPrepareMBB)
@@ -30460,11 +30815,11 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
const TargetRegisterClass *RC =
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
- unsigned Tmp = MRI.createVirtualRegister(RC);
+ Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
- unsigned SP = RegInfo->getStackRegister();
+ Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
@@ -30662,8 +31017,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setRestoreBasePointer(MF);
- unsigned FP = RI.getFrameRegister(*MF);
- unsigned BP = RI.getBaseRegister();
+ Register FP = RI.getFrameRegister(*MF);
+ Register BP = RI.getBaseRegister();
unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
MFI->getRestoreBasePointerOffset())
@@ -30674,7 +31029,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
}
// IReg is used as an index in a memory operand and therefore can't be SP
- unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
+ Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
@@ -30683,8 +31038,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
if (Subtarget.is64Bit()) {
- unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
- unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+ Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
// leaq .LJTI0_0(%rip), BReg
BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
@@ -30710,9 +31065,9 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(0);
break;
case MachineJumpTableInfo::EK_LabelDifference32: {
- unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
- unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
// movl (BReg,IReg64,4), OReg
BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
@@ -30783,8 +31138,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
DefRegs[MOp.getReg()] = true;
MachineInstrBuilder MIB(*MF, &II);
- for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
- unsigned Reg = SavedRegs[RI];
+ for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
+ unsigned Reg = SavedRegs[RegIdx];
if (!DefRegs[Reg])
MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}
@@ -30906,20 +31261,18 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
// Load the old value of the control word...
- unsigned OldCW =
- MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);
// OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
- unsigned NewCW =
- MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
+ Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill).addImm(0xC00);
// Extract to 16 bits.
- unsigned NewCW16 =
- MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+ Register NewCW16 =
+ MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
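
A standalone sketch of the control-word update above: bits 10-11 of the x87 control word select the rounding mode, and 0b11 there means round toward zero, so OR-ing in 0xC00 forces truncation. The forceRoundTowardZero helper is illustrative only.

#include <cassert>
#include <cstdint>

static uint16_t forceRoundTowardZero(uint16_t ControlWord) {
  return ControlWord | 0xC00; // set the RC field (bits 10-11) to 0b11
}

int main() {
  // 0x037F is the usual x87 default control word (round-to-nearest).
  assert(forceRoundTowardZero(0x037F) == 0x0F7F);
  return 0;
}
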
@@ -31023,7 +31376,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
MVT SPTy = getPointerTy(MF->getDataLayout());
const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
- unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
+ Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
@@ -31034,10 +31387,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
// four operand definitions that are E[ABCD] registers. We skip them and
// then insert the LEA.
- MachineBasicBlock::iterator MBBI(MI);
- while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
- MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
- --MBBI;
+ MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
+ while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
+ RMBBI->definesRegister(X86::EBX) ||
+ RMBBI->definesRegister(X86::ECX) ||
+ RMBBI->definesRegister(X86::EDX))) {
+ ++RMBBI;
+ }
+ MachineBasicBlock::iterator MBBI(RMBBI);
addFullAddress(
BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
@@ -31232,12 +31589,21 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.One |= Known2.One;
break;
}
+ case X86ISD::PSADBW: {
+ assert(VT.getScalarType() == MVT::i64 &&
+ Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
+ "Unexpected PSADBW types");
+
+ // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
+ Known.Zero.setBitsFrom(16);
+ break;
+ }
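
A sketch of why the PSADBW known-bits claim above holds: each 64-bit result lane is a sum of eight absolute byte differences, so it is at most 8 * 255 = 2040 and always fits in the low 16 bits. The psadbwLane helper models one lane and is illustrative only.

#include <cassert>
#include <cstdint>
#include <cstdlib>

static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += static_cast<uint64_t>(std::abs(int(A[i]) - int(B[i])));
  return Sum; // always <= 2040, so bits 16..63 are known zero
}

int main() {
  uint8_t A[8] = {255, 255, 255, 255, 255, 255, 255, 255};
  uint8_t B[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  assert(psadbwLane(A, B) == 2040 && psadbwLane(A, B) < (1u << 16));
  return 0;
}
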
case X86ISD::CMOV: {
- Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
+ Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// If we don't know any bits, early out.
if (Known.isUnknown())
break;
- KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
// Only known if known in both the LHS and RHS.
Known.One &= Known2.One;
@@ -31650,8 +32016,8 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
- ArrayRef<int> LoMask(Mask.data() + 0, 4);
- ArrayRef<int> HiMask(Mask.data() + 4, 4);
+ ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
+ ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
// PSHUFLW: permute lower 4 elements only.
if (isUndefOrInRange(LoMask, 0, 4) &&
@@ -31789,8 +32155,8 @@ static bool matchBinaryPermuteShuffle(
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
- if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
- BlendMask)) {
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
@@ -31819,15 +32185,15 @@ static bool matchBinaryPermuteShuffle(
}
}
- // Attempt to combine to INSERTPS.
+ // Attempt to combine to INSERTPS, but only if it has elements that need to
+ // be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
- MaskVT.is128BitVector()) {
- if (Zeroable.getBoolValue() &&
- matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
- Shuffle = X86ISD::INSERTPS;
- ShuffleVT = MVT::v4f32;
- return true;
- }
+ MaskVT.is128BitVector() &&
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
}
// Attempt to combine to SHUFPD.
@@ -31835,7 +32201,11 @@ static bool matchBinaryPermuteShuffle(
((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
- if (matchShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
+ PermuteImm, Mask, Zeroable)) {
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
Shuffle = X86ISD::SHUFP;
ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
@@ -31889,6 +32259,15 @@ static bool matchBinaryPermuteShuffle(
}
}
+ // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector() &&
+ matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+ Shuffle = X86ISD::INSERTPS;
+ ShuffleVT = MVT::v4f32;
+ return true;
+ }
+
return false;
}
@@ -31942,7 +32321,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
- (RootVT.isFloatingPoint() && Depth >= 2) ||
+ (RootVT.isFloatingPoint() && Depth >= 1) ||
(RootVT.is256BitVector() && !Subtarget.hasAVX2());
  // Don't combine if we are an AVX512/EVEX target and the mask element size
@@ -31981,7 +32360,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
@@ -31991,7 +32370,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
DAG.getUNDEF(ShuffleVT),
- DAG.getConstant(PermMask, DL, MVT::i8));
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32026,8 +32405,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
// TODO: Should we indicate which domain is preferred if both are allowed?
- bool AllowFloatDomain = FloatDomain || (Depth > 3);
- bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
+ bool AllowFloatDomain = FloatDomain || (Depth >= 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
@@ -32062,14 +32441,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
MayFoldLoad(V1.getOperand(0))) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
return DAG.getBitcast(RootVT, Res);
}
if (Subtarget.hasAVX2()) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(MaskVT, V1);
Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
@@ -32083,7 +32462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
@@ -32094,11 +32473,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
- DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
@@ -32109,7 +32488,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
@@ -32123,12 +32502,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
- if (Depth == 1 && Root.getOpcode() == Shuffle)
+ if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
- DAG.getConstant(PermuteImm, DL, MVT::i8));
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32141,34 +32520,34 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
uint64_t BitLen, BitIdx;
if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
- if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
V2 = DAG.getBitcast(IntMaskVT, V2);
Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
+ DAG.getTargetConstant(BitLen, DL, MVT::i8),
+ DAG.getTargetConstant(BitIdx, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
- if (Depth < 2)
+ if (Depth < 1)
return SDValue();
// Depth threshold above which we can efficiently use variable mask shuffles.
- int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
+ int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
bool MaskContainsZeros =
@@ -32321,7 +32700,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
V2 = DAG.getBitcast(MaskVT, V2);
SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
- DAG.getConstant(M2ZImm, DL, MVT::i8));
+ DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
return DAG.getBitcast(RootVT, Res);
}
@@ -32650,7 +33029,7 @@ static SDValue combineX86ShufflesRecursively(
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
- if (Depth > MaxRecursionDepth)
+ if (Depth >= MaxRecursionDepth)
return SDValue();
// Directly rip through bitcasts to find the underlying operand.
@@ -32667,11 +33046,18 @@ static SDValue combineX86ShufflesRecursively(
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
+ // TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
- if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
+ APInt OpUndef, OpZero;
+ APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
+ if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, DAG, Depth, false))
return SDValue();
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
// Add the inputs to the Ops list, avoiding duplicates.
SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
@@ -32772,6 +33158,9 @@ static SDValue combineX86ShufflesRecursively(
Mask[i] = OpMaskedIdx;
}
+ // Remove unused/repeated shuffle source ops.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+
// Handle the all undef/zero cases early.
if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
return DAG.getUNDEF(Root.getValueType());
@@ -32783,11 +33172,8 @@ static SDValue combineX86ShufflesRecursively(
return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
SDLoc(Root));
- // Remove unused/repeated shuffle source ops.
- resolveTargetShuffleInputsAndMask(Ops, Mask);
assert(!Ops.empty() && "Shuffle with no inputs detected");
-
- HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
+ HasVariableMask |= IsOpVariableMask;
// Update the list of shuffle nodes that have been combined so far.
SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
@@ -32853,7 +33239,7 @@ static SDValue combineX86ShufflesRecursively(
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
/*HasVarMask*/ false,
/*AllowVarMask*/ true, DAG, Subtarget);
}
@@ -33088,7 +33474,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
for (unsigned i = 0; i != Scale; ++i)
DemandedMask[i] = i;
if (SDValue Res = combineX86ShufflesRecursively(
- {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
+ {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getBitcast(SrcVT, Res));
@@ -33120,6 +33506,30 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
VT.getSizeInBits());
}
+ // vbroadcast(scalarload X) -> vbroadcast_load X
+ // For float loads, extract other uses of the scalar from the broadcast.
+ if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceExtract = Src.hasOneUse();
+ DCI.CombineTo(N.getNode(), BcastLd);
+ if (NoReplaceExtract) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ } else {
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
+ DAG.getIntPtrConstant(0, DL));
+ DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
+ }
+ return N; // Return N so it doesn't get rechecked!
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -33133,14 +33543,14 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
MVT SrcVT = N0.getOperand(0).getSimpleValueType();
if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
SrcVT.getScalarSizeInBits() >= 32) {
- unsigned Mask = N.getConstantOperandVal(2);
+ unsigned BlendMask = N.getConstantOperandVal(2);
unsigned Size = VT.getVectorNumElements();
unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
- unsigned ScaleMask = scaleVectorShuffleBlendMask(Mask, Size, Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
N1.getOperand(0),
- DAG.getConstant(ScaleMask, DL, MVT::i8)));
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
}
}
return SDValue();
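
A sketch of the mask expansion that the blend-mask scaling above is assumed to perform: when a blend over wide elements is rewritten over elements that are Scale times narrower, each mask bit expands into Scale consecutive bits. The scaleBlendMask helper is a hypothetical model, not the LLVM helper itself.

#include <cassert>
#include <cstdint>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}

int main() {
  // A v4i64 blend mask 0b0101 becomes a v8i32 blend mask 0b00110011.
  assert(scaleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  return 0;
}
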
@@ -33208,76 +33618,97 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// If we zero out all elements from Op0 then we don't need to reference it.
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// If we zero out the element from Op1 then we don't need to reference it.
if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
- if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
- int M = TargetMask1[SrcIdx];
- if (isUndefOrZero(M)) {
+ APInt KnownUndef1, KnownZero1;
+ if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
+ KnownZero1)) {
+ if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
// Zero/UNDEF insertion - zero out element and remove dependency.
InsertPSMask |= (1u << DstIdx);
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Update insertps mask srcidx and reference the source input directly.
+ int M = TargetMask1[SrcIdx];
assert(0 <= M && M < 8 && "Shuffle index out of range");
InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
Op1 = Ops1[M < 4 ? 0 : 1];
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
- if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
- return SDValue();
+ APInt KnownUndef0, KnownZero0;
+ if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
+ KnownZero0)) {
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (KnownUndef0[i] || KnownZero0[i]) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
- bool Updated = false;
- bool UseInput00 = false;
- bool UseInput01 = false;
- for (int i = 0; i != 4; ++i) {
- int M = TargetMask0[i];
- if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
- // No change if element is already zero or the inserted element.
- continue;
- } else if (isUndefOrZero(M)) {
- // If the target mask is undef/zero then we must zero the element.
- InsertPSMask |= (1u << i);
- Updated = true;
- continue;
+ // The input vector element must be inline.
+ int M = TargetMask0[i];
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
}
- // The input vector element must be inline.
- if (M != i && M != (i + 4))
- return SDValue();
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Ops0[0];
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Ops0[1];
+ }
- // Determine which inputs of the target shuffle we're using.
- UseInput00 |= (0 <= M && M < 4);
- UseInput01 |= (4 <= M);
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
+ DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}
- // If we're not using both inputs of the target shuffle then use the
- // referenced input directly.
- if (UseInput00 && !UseInput01) {
- Updated = true;
- Op0 = Ops0[0];
- } else if (!UseInput00 && UseInput01) {
- Updated = true;
- Op0 = Ops0[1];
+ // If we're inserting an element from a vbroadcast load, fold the
+ // load into the X86insertps instruction. We need to convert the scalar
+ // load to a vector and clear the source lane of the INSERTPS control.
+ if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
+ if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
+ SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+ Load),
+ DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Insert;
+ }
}
- if (Updated)
- return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
- DAG.getConstant(InsertPSMask, DL, MVT::i8));
-
return SDValue();
}
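
A standalone sketch of the INSERTPS immediate layout that the combines above manipulate: bits [7:6] pick the source lane, bits [5:4] pick the destination lane, and bits [3:0] form a zero mask, which is why masking with 0x3f clears the source-lane field. The makeInsertPSImm helper is illustrative only.

#include <cassert>
#include <cstdint>

static uint8_t makeInsertPSImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZeroMask) {
  assert(SrcIdx < 4 && DstIdx < 4 && ZeroMask < 16 && "fields out of range");
  return uint8_t((SrcIdx << 6) | (DstIdx << 4) | ZeroMask);
}

int main() {
  // Insert source lane 2 into destination lane 1 and zero lane 3.
  uint8_t Imm = makeInsertPSImm(2, 1, 0b1000);
  assert(Imm == 0b10011000);
  assert((Imm & 0x3f) == 0b011000); // clearing bits [7:6] drops the source lane
  return 0;
}
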
default:
@@ -33580,7 +34011,7 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
}
/// Eliminate a redundant shuffle of a horizontal math op.
-static SDValue foldShuffleOfHorizOp(SDNode *N) {
+static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
@@ -33611,17 +34042,36 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
HOp.getOperand(0) != HOp.getOperand(1))
return SDValue();
+ // The shuffle that we are eliminating may have allowed the horizontal op to
+ // have an undemanded (undefined) operand. Duplicate the other (defined)
+ // operand to ensure that the results are defined across all lanes without the
+ // shuffle.
+ auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
+ SDValue X;
+ if (HorizOp.getOperand(0).isUndef()) {
+ assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(1);
+ } else if (HorizOp.getOperand(1).isUndef()) {
+ assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
+ X = HorizOp.getOperand(0);
+ } else {
+ return HorizOp;
+ }
+ return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
+ HorizOp.getValueType(), X, X);
+ };
+
// When the operands of a horizontal math op are identical, the low half of
// the result is the same as the high half. If a target shuffle is also
- // replicating low and high halves, we don't need the shuffle.
+ // replicating low and high halves (and without changing the type/length of
+ // the vector), we don't need the shuffle.
if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
- if (HOp.getScalarValueSizeInBits() == 64) {
+ if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
// movddup (hadd X, X) --> hadd X, X
// broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
assert((HOp.getValueType() == MVT::v2f64 ||
- HOp.getValueType() == MVT::v4f64) && HOp.getValueType() == VT &&
- "Unexpected type for h-op");
- return HOp;
+ HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
+ return updateHOp(HOp, DAG);
}
return SDValue();
}
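
A standalone sketch of the observation behind foldShuffleOfHorizOp: when both operands of a horizontal add are the same vector, the low and high halves of the result are identical, so a shuffle that merely replicates the halves is redundant. The hadd helper models v4f32 HADDPS and is illustrative only.

#include <array>
#include <cassert>

static std::array<float, 4> hadd(const std::array<float, 4> &A,
                                 const std::array<float, 4> &B) {
  // Models v4f32 HADDPS: low half from A, high half from B.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> X = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> R = hadd(X, X);
  assert(R[0] == R[2] && R[1] == R[3]); // halves already match; no shuffle needed
  return 0;
}
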
@@ -33635,14 +34085,14 @@ static SDValue foldShuffleOfHorizOp(SDNode *N) {
(isTargetShuffleEquivalent(Mask, {0, 0}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
- return HOp;
+ return updateHOp(HOp, DAG);
if (HOp.getValueSizeInBits() == 256 &&
(isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
isTargetShuffleEquivalent(
Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
- return HOp;
+ return updateHOp(HOp, DAG);
return SDValue();
}
@@ -33677,7 +34127,7 @@ static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
// the wide shuffle that we started with.
return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
Shuf->getOperand(1), HalfMask, HalfIdx1,
- HalfIdx2, false, DAG);
+ HalfIdx2, false, DAG, /*UseConcat*/true);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
@@ -33696,70 +34146,10 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
- if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+ if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
return HAddSub;
}
- // During Type Legalization, when promoting illegal vector types,
- // the backend might introduce new shuffle dag nodes and bitcasts.
- //
- // This code performs the following transformation:
- // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
- // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
- //
- // We do this only if both the bitcast and the BINOP dag nodes have
- // one use. Also, perform this transformation only if the new binary
- // operation is legal. This is to avoid introducing dag nodes that
- // potentially need to be further expanded (or custom lowered) into a
- // less optimal sequence of dag nodes.
- if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
- N->getOpcode() == ISD::VECTOR_SHUFFLE &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- SDValue BC0 = N0.getOperand(0);
- EVT SVT = BC0.getValueType();
- unsigned Opcode = BC0.getOpcode();
- unsigned NumElts = VT.getVectorNumElements();
-
- if (BC0.hasOneUse() && SVT.isVector() &&
- SVT.getVectorNumElements() * 2 == NumElts &&
- TLI.isOperationLegal(Opcode, VT)) {
- bool CanFold = false;
- switch (Opcode) {
- default : break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- // isOperationLegal lies for integer ops on floating point types.
- CanFold = VT.isInteger();
- break;
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- // isOperationLegal lies for floating point ops on integer types.
- CanFold = VT.isFloatingPoint();
- break;
- }
-
- unsigned SVTNumElts = SVT.getVectorNumElements();
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
- CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
- for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
- CanFold = SVOp->getMaskElt(i) < 0;
-
- if (CanFold) {
- SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
- SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
- SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
- return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
- }
- }
- }
-
// Attempt to combine into a vector load/broadcast.
if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
return LD;
@@ -33841,7 +34231,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
ISD::isNormalLoad(N->getOperand(0).getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- if (!LN->isVolatile()) {
+ if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
SDValue VZLoad =
@@ -33855,53 +34245,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
}
-
- // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
- // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
- // FIXME: This can probably go away once we default to widening legalization.
- if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
- N->getOpcode() == ISD::VECTOR_SHUFFLE &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
- SDValue BC = N->getOperand(0);
- SDValue MULUDQ = BC.getOperand(0);
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- ArrayRef<int> Mask = SVOp->getMask();
- if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
- Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
- SDValue Op0 = MULUDQ.getOperand(0);
- SDValue Op1 = MULUDQ.getOperand(1);
- if (Op0.getOpcode() == ISD::BITCAST &&
- Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
- Op0.getOperand(0).getValueType() == MVT::v4i32) {
- ShuffleVectorSDNode *SVOp0 =
- cast<ShuffleVectorSDNode>(Op0.getOperand(0));
- ArrayRef<int> Mask2 = SVOp0->getMask();
- if (Mask2[0] == 0 && Mask2[1] == -1 &&
- Mask2[2] == 1 && Mask2[3] == -1) {
- Op0 = SVOp0->getOperand(0);
- Op1 = DAG.getBitcast(MVT::v4i32, Op1);
- Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
- return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
- }
- }
- if (Op1.getOpcode() == ISD::BITCAST &&
- Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
- Op1.getOperand(0).getValueType() == MVT::v4i32) {
- ShuffleVectorSDNode *SVOp1 =
- cast<ShuffleVectorSDNode>(Op1.getOperand(0));
- ArrayRef<int> Mask2 = SVOp1->getMask();
- if (Mask2[0] == 0 && Mask2[1] == -1 &&
- Mask2[2] == 1 && Mask2[3] == -1) {
- Op0 = DAG.getBitcast(MVT::v4i32, Op0);
- Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
- Op1 = SVOp1->getOperand(0);
- return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
- }
- }
- }
- }
-
return SDValue();
}
@@ -33966,6 +34309,84 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// TODO convert SrcUndef to KnownUndef.
break;
}
+ case X86ISD::KSHIFTL: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
+ if (Src.getOpcode() == X86ISD::KSHIFTR) {
+ if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTL;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTR;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef <<= ShiftAmt;
+ KnownZero <<= ShiftAmt;
+ KnownZero.setLowBits(ShiftAmt);
+ break;
+ }
+ case X86ISD::KSHIFTR: {
+ SDValue Src = Op.getOperand(0);
+ auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
+ assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
+ unsigned ShiftAmt = Amt->getZExtValue();
+
+ if (ShiftAmt == 0)
+ return TLO.CombineTo(Op, Src);
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted
+ // out) are never demanded.
+ if (Src.getOpcode() == X86ISD::KSHIFTL) {
+ if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
+ unsigned C1 = Src.getConstantOperandVal(1);
+ unsigned NewOpc = X86ISD::KSHIFTR;
+ int Diff = ShiftAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ NewOpc = X86ISD::KSHIFTL;
+ }
+
+ SDLoc dl(Op);
+ SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
+ }
+ }
+
+ APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef.lshrInPlace(ShiftAmt);
+ KnownZero.lshrInPlace(ShiftAmt);
+ KnownZero.setHighBits(ShiftAmt);
+ break;
+ }
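
An illustrative sketch of the KSHIFT folding rule above, using a plain 8-bit value as the mask register: if the low ShAmt bits of ((X >> C1) << ShAmt) are not demanded, the pair collapses to one shift by |ShAmt - C1| in the direction given by the sign of the difference. The combinedShift helper is hypothetical.

#include <cassert>
#include <cstdint>

static uint8_t combinedShift(uint8_t X, unsigned C1, unsigned ShAmt) {
  int Diff = int(ShAmt) - int(C1);
  return Diff >= 0 ? uint8_t(X << Diff) : uint8_t(X >> -Diff);
}

int main() {
  uint8_t X = 0b10110110;
  unsigned C1 = 2, ShAmt = 3;
  uint8_t TwoShifts = uint8_t(uint8_t(X >> C1) << ShAmt);
  uint8_t OneShift = combinedShift(X, C1, ShAmt);
  // The results agree on every bit except the low ShAmt bits, which are the
  // ones the transform requires to be undemanded.
  assert((TwoShifts & ~uint8_t(0b111)) == (OneShift & ~uint8_t(0b111)));
  return 0;
}
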
case X86ISD::CVTSI2P:
case X86ISD::CVTUI2P: {
SDValue Src = Op.getOperand(0);
@@ -33979,16 +34400,36 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+
APInt DemandedLHS, DemandedRHS;
getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
+ Depth + 1))
return true;
+
+ // Aggressively peek through ops to get at the demanded elts.
+    // TODO - we should do this for all target/faux shuffle ops.
+ if (!DemandedElts.isAllOnesValue()) {
+ APInt DemandedSrcBits =
+ APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
+ SDValue NewN0 = SimplifyMultipleUseDemandedBits(
+ N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedBits(
+ N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
+ if (NewN0 || NewN1) {
+ NewN0 = NewN0 ? NewN0 : N0;
+ NewN1 = NewN1 ? NewN1 : N1;
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
+ }
+ }
break;
}
case X86ISD::HADD:
@@ -34062,25 +34503,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
break;
}
- case X86ISD::SUBV_BROADCAST: {
- // Reduce size of broadcast if we don't need the upper half.
- unsigned HalfElts = NumElts / 2;
- if (DemandedElts.extractBits(HalfElts, HalfElts).isNullValue()) {
- SDValue Src = Op.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
-
- SDValue Half = Src;
- if (SrcVT.getVectorNumElements() != HalfElts) {
- MVT HalfVT = MVT::getVectorVT(SrcVT.getScalarType(), HalfElts);
- Half = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, SDLoc(Op), HalfVT, Src);
- }
-
- return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Half, 0,
- TLO.DAG, SDLoc(Op),
- Half.getValueSizeInBits()));
- }
- break;
- }
case X86ISD::VPERMV: {
SDValue Mask = Op.getOperand(0);
APInt MaskUndef, MaskZero;
@@ -34135,6 +34557,21 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
}
+ // Subvector broadcast.
+ case X86ISD::SUBV_BROADCAST: {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueSizeInBits() > ExtSizeInBits)
+ Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
+ else if (Src.getValueSizeInBits() < ExtSizeInBits) {
+ MVT SrcSVT = Src.getSimpleValueType().getScalarType();
+ MVT SrcVT =
+ MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
+ Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
+ }
+ return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
+ TLO.DAG, DL, ExtSizeInBits));
+ }
// Byte shifts by immediate.
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -34201,36 +34638,30 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
}
- // Simplify target shuffles.
- if (!isTargetShuffle(Opc) || !VT.isSimple())
- return false;
-
- // Get target shuffle mask.
- bool IsUnary;
+ // Get target/faux shuffle mask.
+ APInt OpUndef, OpZero;
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
- if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs,
- OpMask, IsUnary))
+ if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
+ OpZero, TLO.DAG, Depth, false))
return false;
- // Shuffle inputs must be the same type as the result.
- if (llvm::any_of(OpInputs,
- [VT](SDValue V) { return VT != V.getValueType(); }))
+ // Shuffle inputs must be the same size as the result.
+ if (OpMask.size() != (unsigned)NumElts ||
+ llvm::any_of(OpInputs, [VT](SDValue V) {
+ return VT.getSizeInBits() != V.getValueSizeInBits() ||
+ !V.getValueType().isVector();
+ }))
return false;
- // Clear known elts that might have been set above.
- KnownZero.clearAllBits();
- KnownUndef.clearAllBits();
+ KnownZero = OpZero;
+ KnownUndef = OpUndef;
// Check if shuffle mask can be simplified to undef/zero/identity.
int NumSrcs = OpInputs.size();
- for (int i = 0; i != NumElts; ++i) {
- int &M = OpMask[i];
+ for (int i = 0; i != NumElts; ++i)
if (!DemandedElts[i])
- M = SM_SentinelUndef;
- else if (0 <= M && OpInputs[M / NumElts].isUndef())
- M = SM_SentinelUndef;
- }
+ OpMask[i] = SM_SentinelUndef;
if (isUndefInRange(OpMask, 0, NumElts)) {
KnownUndef.setAllBits();
@@ -34243,10 +34674,14 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
for (int Src = 0; Src != NumSrcs; ++Src)
if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
- return TLO.CombineTo(Op, OpInputs[Src]);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
// Attempt to simplify inputs.
for (int Src = 0; Src != NumSrcs; ++Src) {
+ // TODO: Support inputs of different types.
+ if (OpInputs[Src].getValueType() != VT)
+ continue;
+
int Lo = Src * NumElts;
APInt SrcElts = APInt::getNullValue(NumElts);
for (int i = 0; i != NumElts; ++i)
@@ -34256,21 +34691,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SrcElts.setBit(M);
}
+ // TODO - Propagate input undef/zero elts.
APInt SrcUndef, SrcZero;
if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
TLO, Depth + 1))
return true;
}
- // Extract known zero/undef elements.
- // TODO - Propagate input undef/zero elts.
- for (int i = 0; i != NumElts; ++i) {
- if (OpMask[i] == SM_SentinelUndef)
- KnownUndef.setBit(i);
- if (OpMask[i] == SM_SentinelZero)
- KnownZero.setBit(i);
- }
-
return false;
}
@@ -34296,6 +34723,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
TLO, Depth + 1))
return true;
+
+ // Aggressively peek through ops to get at the demanded low bits.
+ SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
+ LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
+ RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedLHS || DemandedRHS) {
+ DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
+ DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
+ }
break;
}
case X86ISD::VSHLI: {
@@ -34323,7 +34762,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
SDValue NewShift = TLO.DAG.getNode(
NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
- TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
return TLO.CombineTo(Op, NewShift);
}
}
@@ -34441,6 +34880,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
KnownVec, TLO, Depth + 1))
return true;
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
+
Known = KnownVec.zext(BitWidth, true);
return false;
}
@@ -34542,12 +34986,80 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
+SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ int NumElts = DemandedElts.getBitWidth();
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+
+ switch (Opc) {
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ // If we don't demand the inserted element, return the base vector.
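+ // e.g. PINSRW(vec, scl, 2) can simply return 'vec' when lane 2 isn't demanded.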
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ MVT VecVT = Vec.getSimpleValueType();
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
+ !DemandedElts[CIdx->getZExtValue()])
+ return Vec;
+ break;
+ }
+ }
+
+ APInt ShuffleUndef, ShuffleZero;
+ SmallVector<int, 16> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleOps;
+ if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
+ ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
+ // If all the demanded elts are from one operand and are inline,
+ // then we can use the operand directly.
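+ // e.g. shuffle(X, Y) where every demanded lane i reads element i of X is X.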
+ int NumOps = ShuffleOps.size();
+ if (ShuffleMask.size() == (unsigned)NumElts &&
+ llvm::all_of(ShuffleOps, [VT](SDValue V) {
+ return VT.getSizeInBits() == V.getValueSizeInBits();
+ })) {
+
+ if (DemandedElts.isSubsetOf(ShuffleUndef))
+ return DAG.getUNDEF(VT);
+ if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
+
+ // Bitmask that indicates which ops have only been accessed 'inline'.
+ APInt IdentityOp = APInt::getAllOnesValue(NumOps);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (!DemandedElts[i] || ShuffleUndef[i])
+ continue;
+ int Op = M / NumElts;
+ int Index = M % NumElts;
+ if (M < 0 || Index != i) {
+ IdentityOp.clearAllBits();
+ break;
+ }
+ IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+ if (IdentityOp == 0)
+ break;
+ }
+ assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
+ "Multiple identity shuffles detected");
+
+ if (IdentityOp != 0)
+ return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
+ }
+ }
+
+ return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ Op, DemandedBits, DemandedElts, DAG, Depth);
+}
+
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
-static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue
+XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -34559,13 +35071,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT OriginalVT = InVec.getValueType();
+ unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
// Peek through bitcasts, don't duplicate a load with other uses.
InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
- if (!CurrentVT.isVector() ||
- CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ if (!CurrentVT.isVector())
+ return SDValue();
+
+ unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
+ if ((NumOriginalElts % NumCurrentElts) != 0)
return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
@@ -34582,10 +35098,17 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
ShuffleOps, ShuffleMask, UnaryShuffle))
return SDValue();
+ unsigned Scale = NumOriginalElts / NumCurrentElts;
+ if (Scale > 1) {
+ SmallVector<int, 16> ScaledMask;
+ scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
+ ShuffleMask = std::move(ScaledMask);
+ }
+ assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
+
// Select the input vector, guarding against out of range extract vector.
- unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
+ int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
if (Idx == SM_SentinelZero)
return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
@@ -34598,8 +35121,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
return SDValue();
- assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
+ "Shuffle index out of range");
+ SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
// If inputs to shuffle are the same for both ops, then allow 2 uses
unsigned AllowedUses =
@@ -34619,7 +35143,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
- if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+ if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
return SDValue();
// If there's a bitcast before the shuffle, check if the load type and
@@ -34637,10 +35161,11 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
// Create shuffle node taking into account the case that it's a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
- Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
- ShuffleMask);
- Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
+ SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
+ : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
+ Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
+ DAG.getBitcast(OriginalVT, ShuffleOps[0]),
+ Shuffle, ShuffleMask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
@@ -34660,6 +35185,23 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}
+// Helper to push sign extension of vXi1 SETCC result through bitops.
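+// e.g. sext(and(setcc(a,b),setcc(c,d))) --> and(sext(setcc(a,b)),sext(setcc(c,d))).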
+static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
+ SDValue Src, const SDLoc &DL) {
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ return DAG.getNode(
+ Src.getOpcode(), DL, SExtVT,
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
+ signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
+ }
+ llvm_unreachable("Unexpected node type for vXi1 sign extension");
+}
+
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
@@ -34698,6 +35240,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
+ bool PropagateSExt = false;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
return SDValue();
@@ -34708,8 +35251,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
SExtVT = MVT::v4i32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256))
+ if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
SExtVT = MVT::v4i64;
+ PropagateSExt = true;
+ }
break;
case MVT::v8i1:
SExtVT = MVT::v8i16;
@@ -34718,11 +35263,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- // TODO : use checkBitcastSrcVectorSize
- if (Src.getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
- (Src.getOperand(0).getValueType().is256BitVector() ||
- Src.getOperand(0).getValueType().is512BitVector())) {
+ if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
+ checkBitcastSrcVectorSize(Src, 512))) {
SExtVT = MVT::v8i32;
+ PropagateSExt = true;
}
break;
case MVT::v16i1:
@@ -34745,19 +35289,10 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
return SDValue();
};
- SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+ SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+ : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
- if (SExtVT == MVT::v64i8) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
- Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
- Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
- Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
- Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
- Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
- DAG.getConstant(32, DL, MVT::i8));
- V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
- } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
+ if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
V = getPMOVMSKB(DL, V, DAG, Subtarget);
} else {
if (SExtVT == MVT::v8i16)
@@ -34891,8 +35426,8 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
return DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
- DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
- DAG.getConstant(ShufMask, DL, MVT::i8));
+ DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
+ Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
}
Ops.append(NumElts, Splat);
} else {
@@ -34935,6 +35470,24 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
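+ // e.g. (iN (bitcast (v4i1 (setcc (v4i32 (bitcast (v4f32 X))), 0, setlt))))
+ //        --> zext/trunc of (MOVMSK (v4f32 X)).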
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
+ VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
+ N0.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
+ SDValue N00 = N0.getOperand(0);
+ // Only do this if we can avoid scalarizing the input.
+ if (ISD::isNormalLoad(N00.getNode()) ||
+ (N00.getOpcode() == ISD::BITCAST &&
+ N00.getOperand(0).getValueType() == MVT::v4f32)) {
+ SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, N00));
+ return DAG.getZExtOrTrunc(V, dl, VT);
+ }
+ }
+
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -34949,6 +35502,26 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// type, widen both sides to avoid a trip through memory.
if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
Subtarget.hasAVX512()) {
+ // Use zeros for the widening if we already have some zeroes. This can
+ // allow SimplifyDemandedBits to remove scalar ANDs that may be
+ // downstream of this.
+ // FIXME: It might make sense to detect a concat_vectors with a mix of
+ // zeroes and undef and turn it into insert_subvector for i1 vectors as
+ // a separate combine. What we can't do is canonicalize the operands of
+ // such a concat or we'll get into a loop with SimplifyDemandedBits.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
+ if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
+ SrcVT = LastOp.getValueType();
+ unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
+ SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
+ Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
+ N0 = DAG.getBitcast(MVT::i8, N0);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ }
+ }
+
unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
Ops[0] = N0;
@@ -34958,6 +35531,33 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
+ // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
+ // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
+ // due to insert_subvector legalization on KNL. By promoting the copy to i16
+ // we can help with known bits propagation from the vXi1 domain to the
+ // scalar domain.
+ if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
+ !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == MVT::v16i1 &&
+ isNullConstant(N0.getOperand(1)))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
+ DAG.getBitcast(MVT::i16, N0.getOperand(0)));
+
+ // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
+ // determines the number of bits loaded. Remaining bits are zero.
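+ // e.g. (v4i32 (bitcast (v4f32 (VBROADCAST_LOAD p)))) becomes an integer
+ // VBROADCAST_LOAD of the same pointer since the element widths match.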
+ if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
+ VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ auto *BCast = cast<MemIntrinsicSDNode>(N0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ VT.getVectorElementType(),
+ BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return ResNode;
+ }
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
@@ -35152,7 +35752,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
ISD::NodeType BinOp;
SDValue Src = DAG.matchBinOpReduction(
- Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
if (!Src)
return SDValue();
@@ -35246,29 +35846,31 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SDLoc DL(Extract);
EVT MatchVT = Match.getValueType();
unsigned NumElts = MatchVT.getVectorNumElements();
+ unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ExtractVT == MVT::i1) {
// Special case for (pre-legalization) vXi1 reductions.
- if (NumElts > 32)
+ if (NumElts > 64 || !isPowerOf2_32(NumElts))
return SDValue();
- if (DAG.getTargetLoweringInfo().isTypeLegal(MatchVT)) {
+ if (TLI.isTypeLegal(MatchVT)) {
// If this is a legal AVX512 predicate type then we can just bitcast.
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
// Use combineBitcastvxi1 to create the MOVMSK.
- if (NumElts == 32 && !Subtarget.hasInt256()) {
+ while (NumElts > MaxElts) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
- NumElts = 16;
+ NumElts /= 2;
}
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
}
if (!Movmsk)
return SDValue();
- Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, MVT::i32);
+ Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
// Bail with AVX512VL (which uses predicate registers).
if (Subtarget.hasVLX())
@@ -35309,13 +35911,15 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
NumElts = MaskSrcVT.getVectorNumElements();
}
- assert(NumElts <= 32 && "Not expecting more than 32 elements");
+ assert((NumElts <= 32 || NumElts == 64) &&
+ "Not expecting more than 64 elements");
+ MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
if (BinOp == ISD::XOR) {
// parity -> (AND (CTPOP(MOVMSK X)), 1)
- SDValue Mask = DAG.getConstant(1, DL, MVT::i32);
- SDValue Result = DAG.getNode(ISD::CTPOP, DL, MVT::i32, Movmsk);
- Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, Mask);
+ SDValue Mask = DAG.getConstant(1, DL, CmpVT);
+ SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
+ Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
}
@@ -35323,19 +35927,19 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
ISD::CondCode CondCode;
if (BinOp == ISD::OR) {
// any_of -> MOVMSK != 0
- CmpC = DAG.getConstant(0, DL, MVT::i32);
+ CmpC = DAG.getConstant(0, DL, CmpVT);
CondCode = ISD::CondCode::SETNE;
} else {
// all_of -> MOVMSK == ((1 << NumElts) - 1)
- CmpC = DAG.getConstant((1ULL << NumElts) - 1, DL, MVT::i32);
+ CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
+ DL, CmpVT);
CondCode = ISD::CondCode::SETEQ;
}
// The setcc produces an i8 of 0/1, so extend that to the result width and
// negate to get the final 0/-1 mask value.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT SetccVT =
- TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
@@ -35431,6 +36035,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDLoc dl(N);
SDValue Src = N->getOperand(0);
SDValue Idx = N->getOperand(1);
@@ -35452,10 +36057,37 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT, SrcOp);
}
+ // If we're extracting a single element from a broadcast load and there are
+ // no other users, just create a single load.
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
+ unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
+ if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
+ VT.getSizeInBits() == SrcBCWidth) {
+ SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
+ MemIntr->getBasePtr(),
+ MemIntr->getPointerInfo(),
+ MemIntr->getAlignment(),
+ MemIntr->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // Handle extract(truncate(x)) for 0'th index.
+ // TODO: Treat this as a faux shuffle?
+ // TODO: When can we use this for general indices?
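+ // e.g. extractelt((v4i32 trunc X:v4i64), 0)
+ //        --> extractelt((v4i32 bitcast (extract_subvector X, 0)), 0).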
+ if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
+ isNullConstant(Idx)) {
+ Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
+ Src = DAG.getBitcast(SrcVT, Src);
+ return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
+ }
+
// Resolve the target shuffle inputs and mask.
SmallVector<int, 16> Mask;
SmallVector<SDValue, 2> Ops;
- if (!resolveTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
+ if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
// Attempt to narrow/widen the shuffle mask to the correct size.
@@ -35489,7 +36121,6 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
int SrcIdx = Mask[N->getConstantOperandVal(1)];
- SDLoc dl(N);
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
@@ -35584,7 +36215,7 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
}
// TODO: This switch could include FNEG and the x86-specific FP logic ops
- // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
+ // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
// missed load folding and fma+fneg combining.
switch (Vec.getOpcode()) {
case ISD::FMA: // Begin 3 operands
@@ -35631,27 +36262,84 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
- if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
- return SDValue();
- SDValue Index = ExtElt->getOperand(1);
- if (!isNullConstant(Index))
- return SDValue();
- // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
ISD::NodeType Opc;
- SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+ SDValue Rdx =
+ DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
if (!Rdx)
return SDValue();
+ SDValue Index = ExtElt->getOperand(1);
+ assert(isNullConstant(Index) &&
+ "Reduction doesn't end in an extract from index 0");
+
EVT VT = ExtElt->getValueType(0);
- EVT VecVT = ExtElt->getOperand(0).getValueType();
+ EVT VecVT = Rdx.getValueType();
if (VecVT.getScalarType() != VT)
return SDValue();
- unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
SDLoc DL(ExtElt);
+ // vXi8 reduction - sub-128-bit vector.
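+ // PSADBW against zero sums the bytes in each 64-bit lane, so padding the
+ // input out to 16 bytes and taking the low byte gives the i8 add reduction.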
+ if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
+ if (VecVT == MVT::v4i8) {
+ // Pad with zero.
+ if (Subtarget.hasSSE41()) {
+ Rdx = DAG.getBitcast(MVT::i32, Rdx);
+ Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32), Rdx,
+ DAG.getIntPtrConstant(0, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ } else {
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
+ DAG.getConstant(0, DL, VecVT));
+ }
+ }
+ if (Rdx.getValueType() == MVT::v8i8) {
+ // Pad with undef.
+ Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i8));
+ }
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ DAG.getConstant(0, DL, MVT::v16i8));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Must be a >=128-bit vector with pow2 elements.
+ if ((VecVT.getSizeInBits() % 128) != 0 ||
+ !isPowerOf2_32(VecVT.getVectorNumElements()))
+ return SDValue();
+
+ // vXi8 reduction - sum lo/hi halves then use PSADBW.
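+ // e.g. for v32i8: add the two 128-bit halves, shuffle the upper 8 bytes onto
+ // the lower 8 and add again, then PSADBW against zero and take the low byte.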
+ if (VT == MVT::i8) {
+ while (Rdx.getValueSizeInBits() > 128) {
+ unsigned HalfSize = VecVT.getSizeInBits() / 2;
+ unsigned HalfElts = VecVT.getVectorNumElements() / 2;
+ SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
+ SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
+ Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
+ VecVT = Rdx.getValueType();
+ }
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
+
+ SDValue Hi = DAG.getVectorShuffle(
+ MVT::v16i8, DL, Rdx, Rdx,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
+ Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ }
+
+ // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
+ bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ return SDValue();
+
+ unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+
// 256-bit horizontal instructions operate on 128-bit chunks rather than
// across the whole vector, so we need an extract + hop preliminary stage.
// This is the only step where the operands of the hop are not the same value.
@@ -35661,15 +36349,14 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
unsigned NumElts = VecVT.getVectorNumElements();
SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
- VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
- Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+ Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
+ VecVT = Rdx.getValueType();
}
if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
!((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
return SDValue();
// extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
- assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
for (unsigned i = 0; i != ReductionSteps; ++i)
Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
@@ -35714,15 +36401,26 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
}
- // TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in:
- // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
- // combineBasicSADPattern.
if (IsPextr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(
SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
return SDValue(N, 0);
+
+ // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
+ if ((InputVector.getOpcode() == X86ISD::PINSRB ||
+ InputVector.getOpcode() == X86ISD::PINSRW) &&
+ InputVector.getOperand(2) == EltIdx) {
+ assert(SrcVT == InputVector.getOperand(0).getValueType() &&
+ "Vector type mismatch");
+ SDValue Scl = InputVector.getOperand(1);
+ Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
+ return DAG.getZExtOrTrunc(Scl, dl, VT);
+ }
+
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
+ // combineHorizontalPredicateResult and combineBasicSADPattern.
return SDValue();
}
@@ -35832,6 +36530,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// get simplified at node creation time)?
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ // If both inputs are 0/undef, create a complete zero vector.
+ // FIXME: As noted above this should be handled by DAGCombiner/getNode.
+ if (TValIsAllZeros && FValIsAllZeros) {
+ if (VT.isFloatingPoint())
+ return DAG.getConstantFP(0.0, DL, VT);
+ return DAG.getConstant(0, DL, VT);
+ }
+
if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
@@ -36295,8 +37002,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Since SKX these selects have a proper lowering.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
CondVT.getVectorElementType() == MVT::i1 &&
- (ExperimentalVectorWideningLegalization ||
- VT.getVectorNumElements() > 4) &&
(VT.getVectorElementType() == MVT::i8 ||
VT.getVectorElementType() == MVT::i16)) {
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
@@ -36358,6 +37063,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// subl %esi, $edi
// cmovsl %eax, %edi
if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() &&
DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -36508,6 +37214,12 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
return V;
+ // select(~Cond, X, Y) -> select(Cond, Y, X)
+ if (CondVT.getScalarType() != MVT::i1)
+ if (SDValue CondNot = IsNOT(Cond, DAG))
+ return DAG.getNode(N->getOpcode(), DL, VT,
+ DAG.getBitcast(CondVT, CondNot), RHS, LHS);
+
// Custom action for SELECT MMX
if (VT == MVT::x86mmx) {
LHS = DAG.getBitcast(MVT::i64, LHS);
@@ -36873,8 +37585,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
- SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
- Flags};
+ SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
+ Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
@@ -36923,12 +37635,13 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
- uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
- if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+ APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
+ assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
+ "Implicit constant truncation");
bool isFastMultiplier = false;
- if (Diff < 10) {
- switch ((unsigned char)Diff) {
+ if (Diff.ult(10)) {
+ switch (Diff.getZExtValue()) {
default: break;
case 1: // result = add base, cond
case 2: // result = lea base( , cond*2)
@@ -36943,7 +37656,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
if (isFastMultiplier) {
- APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
Cond = getSETCC(CC, Cond, DL ,DAG);
// Zero extend the condition if needed.
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
@@ -36994,8 +37706,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
if (CC == X86::COND_E &&
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
- SDValue Ops[] = { FalseOp, Cond.getOperand(0),
- DAG.getConstant(CC, DL, MVT::i8), Cond };
+ SDValue Ops[] = {FalseOp, Cond.getOperand(0),
+ DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
@@ -37029,10 +37741,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
CC1 = X86::GetOppositeBranchCondition(CC1);
}
- SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
- Flags};
+ SDValue LOps[] = {FalseOp, TrueOp,
+ DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
- SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
+ SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
+ Flags};
SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
@@ -37064,9 +37777,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
// This should constant fold.
SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
- SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
- DAG.getConstant(X86::COND_NE, DL, MVT::i8),
- Cond);
+ SDValue CMov =
+ DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
}
}
@@ -37166,98 +37879,45 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();
- unsigned RegSize = 128;
- MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
- if (ExperimentalVectorWideningLegalization ||
- NumElts >= OpsVT.getVectorNumElements()) {
- // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
- // lower part is needed.
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
- DL, VT, MulLo);
-
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
- // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
- // the higher part is also needed.
- SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
-
- // Repack the lower part and higher part result of mul into a wider
- // result.
- // Generate shuffle functioning as punpcklwd.
- SmallVector<int, 16> ShuffleMask(NumElts);
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i;
- ShuffleMask[2 * i + 1] = i + NumElts;
- }
- SDValue ResLo =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResLo = DAG.getBitcast(ResVT, ResLo);
- // Generate shuffle functioning as punpckhwd.
- for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
- ShuffleMask[2 * i] = i + NumElts / 2;
- ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
- }
- SDValue ResHi =
- DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResHi = DAG.getBitcast(ResVT, ResHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
- }
-
- // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
- // to legalize the mul explicitly because implicit legalization for type
- // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
- // instructions which will not exist when we explicitly legalize it by
- // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
- // <4 x i16> undef).
- //
- // Legalize the operands of mul.
- // FIXME: We may be able to handle non-concatenated vectors by insertion.
- unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
- if ((RegSize % ReducedSizeInBits) != 0)
- return SDValue();
-
- SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
- DAG.getUNDEF(ReducedVT));
- Ops[0] = NewN0;
- NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
- Ops[0] = NewN1;
- NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
-
- if (Mode == MULU8 || Mode == MULS8) {
- // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
- // part is needed.
- SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
-
- // convert the type of mul result to VT.
- MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
- SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
- : ISD::SIGN_EXTEND_VECTOR_INREG,
- DL, ResVT, Mul);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
- }
+ // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
+ // lower part is needed.
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
+ if (Mode == MULU8 || Mode == MULS8)
+ return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ DL, VT, MulLo);
- // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
- // MULU16/MULS16, both parts are needed.
- SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
+ MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
+ // the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- OpsVT, NewN0, NewN1);
+ ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
- // result. Make sure the type of mul result is VT.
- MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
- SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
- Res = DAG.getBitcast(ResVT, Res);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
- DAG.getIntPtrConstant(0, DL));
+ // result.
+ // Generate shuffle functioning as punpcklwd.
+ SmallVector<int, 16> ShuffleMask(NumElts);
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i;
+ ShuffleMask[2 * i + 1] = i + NumElts;
+ }
+ SDValue ResLo =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResLo = DAG.getBitcast(ResVT, ResLo);
+ // Generate shuffle functioning as punpckhwd.
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i + NumElts / 2;
+ ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
+ }
+ SDValue ResHi =
+ DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
+ ResHi = DAG.getBitcast(ResVT, ResHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
@@ -37365,8 +38025,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
// Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
// Also allow v2i32 if it will be widened.
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) ||
- DAG.getTargetLoweringInfo().isTypeLegal(WVT)))
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -37919,7 +38578,7 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
if (NewShiftVal >= NumBitsPerElt)
NewShiftVal = NumBitsPerElt - 1;
return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
- DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8));
+ DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
// We can decode 'whole byte' logical bit shifts as shuffles.
@@ -38039,7 +38698,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasAVX512()) {
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
+ DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
@@ -38048,10 +38707,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
N->getSimpleValueType(0));
}
- SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
- CMP00.getValueType(), CMP00, CMP01,
- DAG.getConstant(x86cc, DL,
- MVT::i8));
+ SDValue OnesOrZeroesF =
+ DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
+ CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
bool is64BitFP = (CMP00.getValueType() == MVT::f64);
MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
@@ -38083,34 +38741,6 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Match (xor X, -1) -> X.
-// Match extract_subvector(xor X, -1) -> extract_subvector(X).
-// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
- if (V.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
- return V.getOperand(0);
- if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
- if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
- Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
- Not, V.getOperand(1));
- }
- }
- SmallVector<SDValue, 2> CatOps;
- if (collectConcatOps(V.getNode(), CatOps)) {
- for (SDValue &CatOp : CatOps) {
- SDValue NotCat = IsNOT(CatOp, DAG);
- if (!NotCat) return SDValue();
- CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
- }
- return SDValue();
-}
-
/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::AND);
@@ -38273,7 +38903,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
- SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
+ SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
@@ -38499,7 +39129,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchBitOpReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
@@ -38570,7 +39200,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
if (SDValue Shuffle = combineX86ShufflesRecursively(
- {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
+ {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
/*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
N->getOperand(0).getOperand(1));
@@ -38585,7 +39215,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
- EVT VT = N->getValueType(0);
+ MVT VT = N->getSimpleValueType(0);
if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
return SDValue();
@@ -38594,10 +39224,12 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
return SDValue();
- // On XOP we'll lower to PCMOV so accept one use, otherwise only
- // do this if either mask has multiple uses already.
- if (!(Subtarget.hasXOP() || !N0.getOperand(1).hasOneUse() ||
- !N1.getOperand(1).hasOneUse()))
+ // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
+ // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
+ bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
+ Subtarget.hasVLX();
+ if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
+ !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
return SDValue();
// Attempt to extract constant byte masks.
@@ -38895,6 +39527,24 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
DAG.getBitcast(MVT::v4f32, N1)));
}
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
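+ // e.g. or(extractelt(X, 0), or(extractelt(X, 1), ...)) --> (movmsk(X) != 0).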
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (Mask) {
+ APInt AllBits = APInt::getNullValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ }
+ }
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -39136,26 +39786,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
-/// Check if truncation with saturation form type \p SrcVT to \p DstVT
-/// is valid for the given \p Subtarget.
-static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasAVX512())
- return false;
-
- // FIXME: Scalar type may be supported if we move it to vector register.
- if (!SrcVT.isVector())
- return false;
-
- EVT SrcElVT = SrcVT.getScalarType();
- EVT DstElVT = DstVT.getScalarType();
- if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
- return false;
- if (SrcVT.is512BitVector() || Subtarget.hasVLX())
- return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
- return false;
-}
-
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
@@ -39253,64 +39883,61 @@ static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
return SDValue();
}
-/// Detect a pattern of truncation with signed saturation.
-/// The types should allow to use VPMOVSS* instruction on AVX512.
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (!TLI.isTypeLegal(In.getValueType()))
- return SDValue();
- if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
- return SDValue();
- return detectSSatPattern(In, VT);
-}
-
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// The types should allow to use VPMOVUS* instruction on AVX512.
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
- const SDLoc &DL,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (!TLI.isTypeLegal(In.getValueType()))
- return SDValue();
- if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
- return SDValue();
- return detectUSatPattern(In, VT, DAG, DL);
-}
-
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- EVT SVT = VT.getScalarType();
+ if (!Subtarget.hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ EVT SVT = VT.getVectorElementType();
EVT InVT = In.getValueType();
- EVT InSVT = InVT.getScalarType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
- isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
- if (auto SSatVal = detectSSatPattern(In, VT))
- return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
- if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
- return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
- }
- if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
- !Subtarget.hasAVX512() &&
+ EVT InSVT = InVT.getVectorElementType();
+
+ // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+ // split across two registers, we can use a packusdw+perm to clamp to 0-65535
+ // and concatenate at the same time. Then we can use a final vpmovuswb to
+ // clip to 0-255.
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+ DL, DAG, Subtarget);
+ assert(Mid && "Failed to pack!");
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+ }
+ }
+
+ // vXi32 truncate instructions are available with AVX512F.
+ // vXi16 truncate instructions are only available with AVX512BW.
+ // For 256-bit or smaller vectors, we require VLX.
+ // FIXME: We could widen truncates to 512 to remove the VLX restriction.
+ // If the result type is 256 bits or larger and we have disabled 512-bit
+ // registers, we should go ahead and use the pack instructions if possible.
+ bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+ (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+ (InVT.getSizeInBits() > 128) &&
+ (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+ !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+
+ if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
+ VT.getSizeInBits() >= 64 &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
+ // Only do this when the result is at least 64 bits, or we'd be leaving
+ // dangling PACKSSDW nodes.
if (SVT == MVT::i8 && InSVT == MVT::i32) {
EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
VT.getVectorNumElements());
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
- if (Mid)
- return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
- Subtarget);
+ assert(Mid && "Failed to pack!");
+ SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
+ Subtarget);
+ assert(V && "Failed to pack!");
+ return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
@@ -39319,6 +39946,42 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
Subtarget);
}
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
+ Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+ unsigned TruncOpc;
+ SDValue SatVal;
+ if (auto SSatVal = detectSSatPattern(In, VT)) {
+ SatVal = SSatVal;
+ TruncOpc = X86ISD::VTRUNCS;
+ } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+ SatVal = USatVal;
+ TruncOpc = X86ISD::VTRUNCUS;
+ }
+ if (SatVal) {
+ unsigned ResElts = VT.getVectorNumElements();
+ // If the input type is less than 512 bits and we don't have VLX, we need
+ // to widen to 512 bits.
+ if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
+ unsigned NumConcats = 512 / InVT.getSizeInBits();
+ ResElts *= NumConcats;
+ SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
+ ConcatOps[0] = SatVal;
+ InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
+ NumConcats * InVT.getVectorNumElements());
+ SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
+ }
+ // Widen the result if it's narrower than 128 bits.
+ if (ResElts * SVT.getSizeInBits() < 128)
+ ResElts = 128 / SVT.getSizeInBits();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
+ SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+
return SDValue();
}
@@ -39377,7 +40040,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
return true;
};
- // Check if each element of the vector is left-shifted by one.
+ // Check if each element of the vector is right-shifted by one.
auto LHS = In.getOperand(0);
auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
@@ -39679,90 +40342,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
return Blend;
}
- if (Mld->getExtensionType() != ISD::EXTLOAD)
- return SDValue();
-
- // Resolve extending loads.
- EVT VT = Mld->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
- EVT LdVT = Mld->getMemoryVT();
- SDLoc dl(Mld);
-
- assert(LdVT != VT && "Cannot extend to the same type");
- unsigned ToSz = VT.getScalarSizeInBits();
- unsigned FromSz = LdVT.getScalarSizeInBits();
- // From/To sizes and ElemCount must be pow of two.
- assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
- "Unexpected size for extending masked load");
-
- unsigned SizeRatio = ToSz / FromSz;
- assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- LdVT.getScalarType(), NumElems*SizeRatio);
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- // Convert PassThru value.
- SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru());
- if (!Mld->getPassThru().isUndef()) {
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
- WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
- }
-
- // Prepare the new mask.
- SDValue NewMask;
- SDValue Mask = Mld->getMask();
- if (Mask.getValueType() == VT) {
- // Mask and original value have the same type.
- NewMask = DAG.getBitcast(WideVecVT, Mask);
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
- for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
- ShuffleVec[i] = NumElems * SizeRatio;
- NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, dl, WideVecVT),
- ShuffleVec);
- } else {
- assert(Mask.getValueType().getVectorElementType() == MVT::i1);
- unsigned WidenNumElts = NumElems*SizeRatio;
- unsigned MaskNumElts = VT.getVectorNumElements();
- EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WidenNumElts);
-
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
- SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
- Ops[0] = Mask;
- NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
- }
-
- SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
- Mld->getBasePtr(), NewMask, WidePassThru,
- Mld->getMemoryVT(), Mld->getMemOperand(),
- ISD::NON_EXTLOAD);
-
- SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio] = i;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
- SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
- DAG.getUNDEF(WideVecVT), ShuffleVec);
- SlicedVec = DAG.getBitcast(VT, SlicedVec);
-
- return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
+ return SDValue();
}
/// If exactly one element of the mask is set for a non-truncating masked store,
@@ -39800,123 +40380,45 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = Mst->getValue().getValueType();
- EVT StVT = Mst->getMemoryVT();
SDLoc dl(Mst);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!Mst->isTruncatingStore()) {
- if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
- return ScalarStore;
-
- // If the mask value has been legalized to a non-boolean vector, try to
- // simplify ops leading up to it. We only demand the MSB of each lane.
- SDValue Mask = Mst->getMask();
- if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
- return SDValue(N, 0);
- }
-
- // TODO: AVX512 targets should also be able to simplify something like the
- // pattern above, but that pattern will be different. It will either need to
- // match setcc more generally or match PCMPGTM later (in tablegen?).
-
- SDValue Value = Mst->getValue();
- if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
- TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
- Mst->getMemoryVT())) {
- return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
- }
-
+ if (Mst->isTruncatingStore())
return SDValue();
- }
-
- // Resolve truncating stores.
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getScalarSizeInBits();
- unsigned ToSz = StVT.getScalarSizeInBits();
-
- // The truncating store is legal in some cases. For example
- // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
- // are designated for truncate store.
- // In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegal(VT, StVT))
- return SDValue();
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ return ScalarStore;
- // From/To sizes and ElemCount must be pow of two.
- assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
- "Unexpected size for truncating masked store");
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- assert (((NumElems * FromSz) % ToSz) == 0 &&
- "Unexpected ratio for truncating masked store");
-
- unsigned SizeRatio = FromSz / ToSz;
- assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- StVT.getScalarType(), NumElems*SizeRatio);
-
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
-
- // Can't shuffle using an illegal type.
- assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
- "WideVecVT should be legal");
-
- SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVecVT),
- ShuffleVec);
-
- SDValue NewMask;
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
- if (Mask.getValueType() == VT) {
- // Mask and original value have the same type.
- NewMask = DAG.getBitcast(WideVecVT, Mask);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
- for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
- ShuffleVec[i] = NumElems*SizeRatio;
- NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
- DAG.getConstant(0, dl, WideVecVT),
- ShuffleVec);
- } else {
- assert(Mask.getValueType().getVectorElementType() == MVT::i1);
- unsigned WidenNumElts = NumElems*SizeRatio;
- unsigned MaskNumElts = VT.getVectorNumElements();
- EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
- WidenNumElts);
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
- unsigned NumConcat = WidenNumElts / MaskNumElts;
- SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
- SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
- Ops[0] = Mask;
- NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ SDValue Value = Mst->getValue();
+ if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ Mst->getMemoryVT())) {
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
+ Mst->getBasePtr(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(), true);
}
- return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
- Mst->getBasePtr(), NewMask, StVT,
- Mst->getMemOperand(), false);
+ return SDValue();
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
StoreSDNode *St = cast<StoreSDNode>(N);
- EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
unsigned Alignment = St->getAlignment();
- SDValue StoredVal = St->getOperand(1);
+ SDValue StoredVal = St->getValue();
+ EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Convert a store of vXi1 into a store of iX and a bitcast.
@@ -39986,8 +40488,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
}
- // If we are saving a concatenation of two XMM registers and 32-byte stores
- // are slow, such as on Sandy Bridge, perform two 16-byte stores.
+ // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
+ // Sandy Bridge, perform two 16-byte stores.
bool Fast;
if (VT.is256BitVector() && StVT == VT &&
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
@@ -40026,13 +40528,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
St->getValue().getOpcode() == ISD::TRUNCATE &&
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
- TLI.isTruncStoreLegalOrCustom(MVT::v16i32, MVT::v16i8) &&
- !DCI.isBeforeLegalizeOps()) {
+ TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
+ St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
+ // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
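+ // e.g. (store (X86ISD::VTRUNCUS x)) is replaced with a single saturating
+ // truncating store emitted by EmitTruncSStore, provided a truncating store
+ // from x's type to the stored type is legal on this target.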
+ if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
+ (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
+ StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
+ TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
+ bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
+ return EmitTruncSStore(IsSigned, St->getChain(),
+ dl, StoredVal.getOperand(0), St->getBasePtr(),
+ VT, St->getMemOperand(), DAG);
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
@@ -40040,100 +40553,26 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Check if we can detect an AVG pattern from the truncation. If yes,
// replace the trunc store by a normal store with the result of X86ISD::AVG
// instruction.
- if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
- Subtarget, dl))
- return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
-
- if (SDValue Val =
- detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
- TLI))
- return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
- if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
- DAG, dl, Subtarget, TLI))
- return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
-
- unsigned NumElems = VT.getVectorNumElements();
- assert(StVT != VT && "Cannot truncate to the same type");
- unsigned FromSz = VT.getScalarSizeInBits();
- unsigned ToSz = StVT.getScalarSizeInBits();
-
- // The truncating store is legal in some cases. For example
- // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
- // are designated for truncate store.
- // In this case we don't need any further transformations.
- if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
- return SDValue();
-
- // From, To sizes and ElemCount must be pow of two
- if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
- // We are going to use the original vector elt for storing.
- // Accumulated smaller vector elements must be a multiple of the store size.
- if (0 != (NumElems * FromSz) % ToSz) return SDValue();
-
- unsigned SizeRatio = FromSz / ToSz;
-
- assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
-
- // Create a type on which we perform the shuffle
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
- StVT.getScalarType(), NumElems*SizeRatio);
-
- assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
-
- SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i] = i * SizeRatio;
+ if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
+ if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
+ Subtarget, dl))
+ return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT))
- return SDValue();
-
- SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
- DAG.getUNDEF(WideVecVT),
- ShuffleVec);
- // At this point all of the data is stored at the bottom of the
- // register. We now need to save it to mem.
-
- // Find the largest store unit
- MVT StoreType = MVT::i8;
- for (MVT Tp : MVT::integer_valuetypes()) {
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
- StoreType = Tp;
- }
-
- // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
- if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
- (64 <= NumElems * ToSz))
- StoreType = MVT::f64;
-
- // Bitcast the original vector into a vector of store-size units
- EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
- assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
- SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
- SmallVector<SDValue, 8> Chains;
- SDValue Ptr = St->getBasePtr();
-
- // Perform one or more big stores into memory.
- for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- StoreType, ShuffWide,
- DAG.getIntPtrConstant(i, dl));
- SDValue Ch =
- DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
- Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
- Chains.push_back(Ch);
+ if (TLI.isTruncStoreLegal(VT, StVT)) {
+ if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+ if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
+ DAG, dl))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
}
- return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ return SDValue();
}
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
@@ -40149,11 +40588,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
- if (((VT.isVector() && !VT.isFloatingPoint()) ||
- (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
+ if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
isa<LoadSDNode>(St->getValue()) &&
- !cast<LoadSDNode>(St->getValue())->isVolatile() &&
- St->getChain().hasOneUse() && !St->isVolatile()) {
+ cast<LoadSDNode>(St->getValue())->isSimple() &&
+ St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
SmallVector<SDValue, 8> Ops;
@@ -40595,8 +41033,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
@@ -40620,6 +41058,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // AVX512 has fast truncate, but if the input is already going to be split,
+ // there's no harm in trying to use pack.
+ if (Subtarget.hasAVX512() &&
+ !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
+ InVT.is512BitVector()))
+ return SDValue();
+
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
@@ -40658,9 +41103,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
// Only handle vXi16 types that are at least 128-bits unless they will be
// widened.
- if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
- (!ExperimentalVectorWideningLegalization &&
- VT.getVectorNumElements() < 8))
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
// Input type should be vXi32.
@@ -40874,6 +41317,19 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return combineVectorTruncation(N, DAG, Subtarget);
}
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue In = N->getOperand(0);
+ SDLoc DL(N);
+
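+ // Fold a saturation clamp feeding the truncate into the saturating truncate
+ // nodes, e.g. vtrunc(smin(smax(x, MIN), MAX)) -> vtruncs(x), with MIN/MAX
+ // being the signed limits of the destination element type; the unsigned
+ // clamp likewise becomes vtruncus.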
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
+ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+
+ return SDValue();
+}
+
/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
@@ -40883,10 +41339,14 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
/// In this case we go though all bitcasts.
/// This also recognizes splat of a negated value and returns the splat of that
/// value.
-static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
+static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (N->getOpcode() == ISD::FNEG)
return N->getOperand(0);
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
@@ -40900,7 +41360,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
if (!SVOp->getOperand(1).isUndef())
return SDValue();
- if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode()))
+ if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
SVOp->getMask());
@@ -40914,7 +41374,7 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
SDValue InsVal = Op.getOperand(1);
if (!InsVector.isUndef())
return SDValue();
- if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode()))
+ if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
@@ -40951,6 +41411,57 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) {
return SDValue();
}
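+// Given an FMA-family opcode, return the equivalent opcode with the requested
+// pieces negated: NegMul negates the product, NegAcc negates the accumulator,
+// and NegRes negates the final result.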
+static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
+ bool NegRes) {
+ if (NegMul) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ }
+ }
+
+ if (NegAcc) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ }
+ }
+
+ if (NegRes) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode");
+ case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ }
+ }
+
+ return Opcode;
+}
+
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -40980,29 +41491,123 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
// If we're negating an FMA node, then we can adjust the
// instruction to include the extra negation.
- unsigned NewOpcode = 0;
if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
switch (Arg.getOpcode()) {
- case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
- case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
- case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
- // We can't handle scalar intrinsic node here because it would only
- // invert one element and not the whole vector. But we could try to handle
- // a negation of the lower element only.
- }
- }
- if (NewOpcode)
- return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
- Arg.getNode()->ops()));
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ // We can't handle scalar intrinsic node here because it would only
+ // invert one element and not the whole vector. But we could try to handle
+ // a negation of the lower element only.
+ unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
+ return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
+ }
+ }
+ }
return SDValue();
}
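+// Mirror the generic hook's convention: return 2 when negating Op removes an
+// existing negation (always profitable), 1 when the negation is merely free,
+// and defer to TargetLowering for everything else.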
+char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ unsigned Depth) const {
+ // fneg patterns are removable even if they have multiple uses.
+ if (isFNEG(DAG, Op.getNode(), Depth))
+ return 2;
+
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return 0;
+
+ EVT VT = Op.getValueType();
+ EVT SVT = VT.getScalarType();
+ switch (Op.getOpcode()) {
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
+ !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ break;
+
+ // This is always negatible for free but we might be able to remove some
+ // extra operand negations as well.
+ for (int i = 0; i != 3; ++i) {
+ char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (V == 2)
+ return V;
+ }
+ return 1;
+ }
+ }
+
+ return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
+ ForCodeSize, Depth);
+}
+
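+// Materialize the negated form promised by isNegatibleForFree above, folding
+// operand negations into the FMA opcode via negateFMAOpcode.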
+SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ unsigned Depth) const {
+ // fneg patterns are removable even if they have multiple uses.
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
+ return DAG.getBitcast(Op.getValueType(), Arg);
+
+ EVT VT = Op.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::FMA:
+ case X86ISD::FMSUB:
+ case X86ISD::FNMADD:
+ case X86ISD::FNMSUB:
+ case X86ISD::FMADD_RND:
+ case X86ISD::FMSUB_RND:
+ case X86ISD::FNMADD_RND:
+ case X86ISD::FNMSUB_RND: {
+ if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
+ !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ break;
+
+ // This is always negatible for free but we might be able to remove some
+ // extra operand negations as well.
+ SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
+ for (int i = 0; i != 3; ++i) {
+ char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (V == 2)
+ NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ }
+
+ bool NegA = !!NewOps[0];
+ bool NegB = !!NewOps[1];
+ bool NegC = !!NewOps[2];
+ unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+
+ // Fill in the non-negated ops with the original values.
+ for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
+ if (!NewOps[i])
+ NewOps[i] = Op.getOperand(i);
+ return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
+ }
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Depth);
+}
+
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = N->getSimpleValueType(0);
@@ -41312,8 +41917,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile.
- if (!LN->isVolatile()) {
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getIntegerVT(NumBits);
@@ -41347,8 +41952,8 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile.
- if (!LN->isVolatile()) {
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
SDLoc dl(N);
unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
MVT MemVT = MVT::getFloatingPointVT(NumBits);
@@ -41724,127 +42329,6 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(EltSizeInBits - 1, DL, VT));
}
-/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
-/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
-/// with UNDEFs) of the input to vectors of the same size as the target type
-/// which then extends the lowest elements.
-static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- unsigned Opcode = N->getOpcode();
- // TODO - add ANY_EXTEND support.
- if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
- return SDValue();
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
- EVT SVT = VT.getScalarType();
- EVT InVT = N0.getValueType();
- EVT InSVT = InVT.getScalarType();
-
- // FIXME: Generic DAGCombiner previously had a bug that would cause a
- // sign_extend of setcc to sometimes return the original node and tricked it
- // into thinking CombineTo was used which prevented the target combines from
- // running.
- // Earlying out here to avoid regressions like this
- // (v4i32 (sext (v4i1 (setcc (v4i16)))))
- // Becomes
- // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef))))
- // Type legalized to
- // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32)))))))
- // Leading to a packssdw+pmovsxwd
- // We could write a DAG combine to fix this, but really we shouldn't be
- // creating sext_invec that's forcing v8i16 into the DAG.
- if (N0.getOpcode() == ISD::SETCC)
- return SDValue();
-
- // Input type must be a vector and we must be extending legal integer types.
- if (!VT.isVector() || VT.getVectorNumElements() < 2)
- return SDValue();
- if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
- return SDValue();
- if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
- return SDValue();
-
- // If the input/output types are both legal then we have at least AVX1 and
- // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
- if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- DAG.getTargetLoweringInfo().isTypeLegal(InVT))
- return SDValue();
-
- SDLoc DL(N);
-
- auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
- EVT SrcVT = N.getValueType();
- EVT DstVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
- Size / SrcVT.getScalarSizeInBits());
- SmallVector<SDValue, 8> Opnds(Size / SrcVT.getSizeInBits(),
- DAG.getUNDEF(SrcVT));
- Opnds[0] = N;
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Opnds);
- };
-
- // If target-size is less than 128-bits, extend to a type that would extend
- // to 128 bits, extend that and extract the original target vector.
- if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
- unsigned Scale = 128 / VT.getSizeInBits();
- EVT ExVT =
- EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
- SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
- SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // If target-size is 128-bits (or 256-bits on AVX target), then convert to
- // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
- // Also use this if we don't have SSE41 to allow the legalizer do its job.
- if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasAVX()) ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
- SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
- Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- return DAG.getNode(Opcode, DL, VT, ExOp);
- }
-
- auto SplitAndExtendInReg = [&](unsigned SplitSize) {
- unsigned NumVecs = VT.getSizeInBits() / SplitSize;
- unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
- EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
- EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
-
- unsigned IROpc = getOpcode_EXTEND_VECTOR_INREG(Opcode);
- SmallVector<SDValue, 8> Opnds;
- for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
- SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
- DAG.getIntPtrConstant(Offset, DL));
- SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
- SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
- Opnds.push_back(SrcVec);
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
- };
-
- // On pre-AVX targets, split into 128-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128))
- return SplitAndExtendInReg(128);
-
- // On pre-AVX512 targets, split into 256-bit nodes of
- // ISD::*_EXTEND_VECTOR_INREG.
- if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
- return SplitAndExtendInReg(256);
-
- return SDValue();
-}
-
// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
@@ -41915,9 +42399,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
}
- if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
- return V;
-
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -41931,45 +42412,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
- if (NegMul) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
- }
- }
-
- if (NegAcc) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- }
- }
-
- return Opcode;
-}
-
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
// Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(VT))
return SDValue();
EVT ScalarVT = VT.getScalarType();
@@ -41980,17 +42431,21 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
SDValue B = N->getOperand(1);
SDValue C = N->getOperand(2);
- auto invertIfNegative = [&DAG](SDValue &V) {
- if (SDValue NegVal = isFNEG(DAG, V.getNode())) {
- V = DAG.getBitcast(V.getValueType(), NegVal);
+ auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
+ V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
// new extract from the FNEG input.
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
- if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) {
- NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
+ SDValue Vec = V.getOperand(0);
+ if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
+ SDValue NegVal =
+ TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
NegVal, V.getOperand(1));
return true;
@@ -42009,7 +42464,8 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (!NegA && !NegB && !NegC)
return SDValue();
- unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
+ unsigned NewOpcode =
+ negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
@@ -42017,33 +42473,27 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
+// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
- SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode());
- if (!NegVal)
+ SDValue N2 = N->getOperand(2);
+ if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
return SDValue();
- // FIXME: Should we bitcast instead?
- if (NegVal.getValueType() != VT)
- return SDValue();
-
- unsigned NewOpcode;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
- }
+ SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
- NegVal, N->getOperand(3));
+ NegN2, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
- NegVal);
+ NegN2);
}
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -42090,9 +42540,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
return V;
- if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
- return V;
-
if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -42111,12 +42558,11 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
- unsigned NumSrcElts = N00.getValueType().getVectorNumElements();
unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
(N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
- return concatSubVectors(N00, N01, VT, NumSrcElts * 2, DAG, dl, 128);
+ return concatSubVectors(N00, N01, DAG, dl);
}
}
@@ -42159,16 +42605,30 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
!IsOrXorXorCCZero)
return SDValue();
- // TODO: Use PXOR + PTEST for SSE4.1 or later?
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
+ bool HasAVX = Subtarget.hasAVX();
+
+ // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+ // Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX2()) ||
+ (OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
- EVT VecVT = OpSize == 512 ? MVT::v16i32 :
- OpSize == 256 ? MVT::v32i8 :
- MVT::v16i8;
- EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT;
+ bool HasPT = Subtarget.hasSSE41();
+ EVT VecVT = MVT::v16i8;
+ EVT CmpVT = MVT::v16i8;
+ if (OpSize == 256)
+ VecVT = CmpVT = MVT::v32i8;
+ if (OpSize == 512) {
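+ // With BWI we can compare all 64 bytes at once and kortest the v64i1 mask;
+ // without BWI fall back to dword elements and a v16i1 mask.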
+ if (Subtarget.hasBWI()) {
+ VecVT = MVT::v64i8;
+ CmpVT = MVT::v64i1;
+ } else {
+ VecVT = MVT::v16i32;
+ CmpVT = MVT::v16i1;
+ }
+ }
+
SDValue Cmp;
if (IsOrXorXorCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
@@ -42179,18 +42639,38 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
- SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
- Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+ if (VecVT == CmpVT && HasPT) {
+ SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
+ Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
+ } else {
+ SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
+ Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
+ }
} else {
SDValue VecX = DAG.getBitcast(VecVT, X);
SDValue VecY = DAG.getBitcast(VecVT, Y);
- Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+ if (VecVT == CmpVT && HasPT) {
+ Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
+ } else {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+ }
}
// For 512-bits we want to emit a setcc that will lower to kortest.
- if (OpSize == 512)
- return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp),
- DAG.getConstant(0xFFFF, DL, MVT::i16), CC);
+ if (VecVT != CmpVT) {
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
+ SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+ }
+ if (HasPT) {
+ SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
+ Cmp);
+ SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
+ X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
+ SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
+ }
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
@@ -42270,8 +42750,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
// go through type promotion to a 128-bit vector.
if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
VT.getVectorElementType() == MVT::i1 &&
- (ExperimentalVectorWideningLegalization ||
- VT.getVectorNumElements() > 4) &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
@@ -42289,7 +42767,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue Src = N->getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = N->getSimpleValueType(0);
@@ -42310,7 +42789,7 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
// Look through int->fp bitcasts that don't change the element width.
unsigned EltWidth = SrcVT.getScalarSizeInBits();
- if (Src.getOpcode() == ISD::BITCAST &&
+ if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
@@ -42334,71 +42813,123 @@ static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
+ auto *GorS = cast<MaskedGatherScatterSDNode>(N);
+ SDValue Chain = GorS->getChain();
+ SDValue Index = GorS->getIndex();
+ SDValue Mask = GorS->getMask();
+ SDValue Base = GorS->getBasePtr();
+ SDValue Scale = GorS->getScale();
- if (DCI.isBeforeLegalizeOps()) {
- SDValue Index = N->getOperand(4);
- // Remove any sign extends from 32 or smaller to larger than 32.
- // Only do this before LegalizeOps in case we need the sign extend for
- // legalization.
- if (Index.getOpcode() == ISD::SIGN_EXTEND) {
- if (Index.getScalarValueSizeInBits() > 32 &&
- Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N) {
- // The original sign extend has less users, add back to worklist in
- // case it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
+ if (DCI.isBeforeLegalize()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+ // Shrink constant indices if they are larger than 32-bits.
+ // Only do this before legalize types since v2i64 could become v2i32.
+ // FIXME: We could check that the type is legal if we're after legalize
+ // types, but then we would need to construct test cases where that happens.
+ // FIXME: We could support more than just constant vectors, but we need to
+ // be careful with costing. A truncate that can be optimized out would be fine.
+ // Otherwise we might only want to create a truncate if it avoids a split.
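+ // e.g. a v2i64 build_vector of small constants has enough sign bits to be
+ // truncated to v2i32 and used directly as the gather/scatter index.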
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
+ if (BV->isConstant() && IndexWidth > 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
}
- return SDValue(Res, 0);
- }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+ }
+ }
+
+ // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
+ // there are sufficient sign bits. Only do this before legalize types to
+ // avoid creating illegal types in truncate.
+ if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
+ Index.getOpcode() == ISD::ZERO_EXTEND) &&
+ IndexWidth > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
}
+ }
+
+ if (DCI.isBeforeLegalizeOps()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
- unsigned ScalarSize = Index.getScalarValueSizeInBits();
- if (ScalarSize != 32 && ScalarSize != 64) {
- MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ if (IndexWidth != 32 && IndexWidth != 64) {
+ MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index;
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N)
- DCI.AddToWorklist(N);
- return SDValue(Res, 0);
- }
-
- // Try to remove zero extends from 32->64 if we know the sign bit of
- // the input is zero.
- if (Index.getOpcode() == ISD::ZERO_EXTEND &&
- Index.getScalarValueSizeInBits() == 64 &&
- Index.getOperand(0).getScalarValueSizeInBits() == 32) {
- if (DAG.SignBitIsZero(Index.getOperand(0))) {
- SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
- if (Res == N) {
- // The original sign extend has less users, add back to worklist in
- // case it needs to be removed
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- }
- return SDValue(Res, 0);
- }
- }
- }
-
- // With AVX2 we only demand the upper bit of the mask.
- if (!Subtarget.hasAVX512()) {
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Chain, Gather->getPassThru(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Chain, Scatter->getValue(),
+ Mask, Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+ }
+ }
+
+ // With vector masks we only demand the upper bit of the mask.
+ if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Mask = N->getOperand(2);
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
return SDValue(N, 0);
@@ -42432,7 +42963,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
- SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
+ SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
}
@@ -42549,6 +43080,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
}
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
@@ -42578,13 +43110,22 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
unsigned BitWidth = InVT.getScalarSizeInBits();
unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
if (NumSignBits >= (BitWidth - 31)) {
- EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+ EVT TruncVT = MVT::i32;
if (InVT.isVector())
TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
InVT.getVectorNumElements());
SDLoc dl(N);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
- return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+ }
+ // If we're after legalize and the type is v2i32 we need to shuffle and
+ // use CVTSI2P.
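+ // The shuffle below picks out the low 32 bits of each i64 element so that
+ // CVTSI2P converts the two truncated values in the low lanes.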
+ assert(InVT == MVT::v2i64 && "Unexpected VT!");
+ SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
+ { 0, 2, -1, -1 });
+ return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
@@ -42604,7 +43145,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
- if (!Ld->isVolatile() && !VT.isVector() &&
+ if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
@@ -42841,12 +43382,12 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
- SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL,
- MVT::i8),
- N->getOperand(2)),
- DAG.getConstant(1, DL, VT));
+ SDValue Res1 =
+ DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ N->getOperand(2)),
+ DAG.getConstant(1, DL, VT));
return DCI.CombineTo(N, Res1, CarryOut);
}
@@ -42906,7 +43447,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
Y.getOperand(1));
}
@@ -42924,7 +43465,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
NewEFLAGS);
}
}
@@ -42984,7 +43525,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
SDValue(Neg.getNode(), 1));
}
@@ -42997,7 +43538,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDValue One = DAG.getConstant(1, DL, ZVT);
SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
}
}
@@ -43025,9 +43566,6 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
EVT VT = N->getValueType(0);
// If the vector size is less than 128, or greater than the supported RegSize,
@@ -43035,14 +43573,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorNumElements() < 8)
return SDValue();
- if (Op0.getOpcode() != ISD::MUL)
- std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::MUL)
- return SDValue();
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
- ShrinkMode Mode;
- if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16)
- return SDValue();
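+ // PMADDWD multiplies signed 16-bit elements, so only use it when the multiply
+ // can be shrunk to 16 bits without requiring the unsigned MULU16 mode (with
+ // SSE4.1 we additionally require the multiply to be the only user of its
+ // operands).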
+ auto UsePMADDWD = [&](SDValue Op) {
+ ShrinkMode Mode;
+ return Op.getOpcode() == ISD::MUL &&
+ canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+ (!Subtarget.hasSSE41() ||
+ (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+ Op->isOnlyUserOf(Op.getOperand(1).getNode())));
+ };
+
+ SDValue MulOp, OtherOp;
+ if (UsePMADDWD(Op0)) {
+ MulOp = Op0;
+ OtherOp = Op1;
+ } else if (UsePMADDWD(Op1)) {
+ MulOp = Op1;
+ OtherOp = Op0;
+ } else
+ return SDValue();
SDLoc DL(N);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
@@ -43050,34 +43601,27 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements() / 2);
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
// Madd vector size is half of the original vector size
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
};
-
- auto BuildPMADDWD = [&](SDValue Mul) {
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));
-
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
- DAG.getConstant(0, DL, MAddVT));
- };
-
- Op0 = BuildPMADDWD(Op0);
-
- // It's possible that Op1 is also a mul we can reduce.
- if (Op1.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
- Op1 = BuildPMADDWD(Op1);
- }
-
- return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+ SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
+ PMADDWDBuilder);
+ // Fill the rest of the output with 0
+ SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+
+ // Preserve the reduction flag on the ADD. We may need to revisit for the
+ // other operand.
+ SDNodeFlags Flags;
+ Flags.setVectorReduction(true);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
}
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -43087,8 +43631,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
// TODO: There's nothing special about i32, any integer type above i16 should
// work just as well.
@@ -43108,80 +43650,53 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
if (VT.getSizeInBits() / 4 > RegSize)
return SDValue();
- // We know N is a reduction add, which means one of its operands is a phi.
- // To match SAD, we need the other operand to be a ABS.
- if (Op0.getOpcode() != ISD::ABS)
- std::swap(Op0, Op1);
- if (Op0.getOpcode() != ISD::ABS)
- return SDValue();
-
- auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
- // anyway.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- if (VT.getSizeInBits() >= ResVT.getSizeInBits())
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
- else
- Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- SDValue Zero = DAG.getConstant(0, DL, VT);
- Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- return Sad;
- };
+ // We know N is a reduction add. To match SAD, we need one of the operands to
+ // be an ABS.
+ SDValue AbsOp = N->getOperand(0);
+ SDValue OtherOp = N->getOperand(1);
+ if (AbsOp.getOpcode() != ISD::ABS)
+ std::swap(AbsOp, OtherOp);
+ if (AbsOp.getOpcode() != ISD::ABS)
+ return SDValue();
// Check whether we have an abs-diff pattern feeding into the select.
SDValue SadOp0, SadOp1;
- if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
- return SDValue();
-
- Op0 = BuildPSADBW(SadOp0, SadOp1);
-
- // It's possible we have a sad on the other side too.
- if (Op1.getOpcode() == ISD::ABS &&
- detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
- Op1 = BuildPSADBW(SadOp0, SadOp1);
- }
-
- return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
-}
-
-/// Convert vector increment or decrement to sub/add with an all-ones constant:
-/// add X, <1, 1...> --> sub X, <-1, -1...>
-/// sub X, <1, 1...> --> add X, <-1, -1...>
-/// The all-ones vector constant can be materialized using a pcmpeq instruction
-/// that is commonly recognized as an idiom (has no register dependency), so
-/// that's better/smaller than loading a splat 1 constant.
-static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
- assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
- "Unexpected opcode for increment/decrement transform");
-
- // Pseudo-legality check: getOnesVector() expects one of these types, so bail
- // out and wait for legalization if we have an unsupported vector length.
- EVT VT = N->getValueType(0);
- if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
- return SDValue();
-
- APInt SplatVal;
- if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
- return SDValue();
-
- SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
- unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
- return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+  if (!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
+ return SDValue();
+
+ // SAD pattern detected. Now build a SAD instruction and an addition for
+ // reduction. Note that the number of elements of the result of SAD is less
+  // reduction. Note that the number of elements of the result of SAD is less
+  // than the number of elements of its input. Therefore, we can only update
+  // part of the elements in the reduction vector.
+ SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
+
+ // The output of PSADBW is a vector of i64.
+ // We need to turn the vector of i64 into a vector of i32.
+ // If the reduction vector is at least as wide as the psadbw result, just
+  // bitcast. If it's narrower, which can only occur for v2i32, bits 127:16 of
+  // the PSADBW will be zero. If we promote narrow vectors, truncate the v2i64
+  // result to v2i32, which will be removed by type legalization. If we widen
+  // narrow vectors then we bitcast to v4i32 and extract v2i32.
+ MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
+ Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
+
+ if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
+ // Fill the upper elements with zero to match the add width.
+ assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
+ unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
+ Ops[0] = Sad;
+ Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
+ } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
+ Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Preserve the reduction flag on the ADD. We may need to revisit for the
+ // other operand.
+ SDNodeFlags Flags;
+ Flags.setVectorReduction(true);
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
}
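
For reference, a standalone model of what PSADBW computes (illustrative only;
psadbw below is a made-up helper, not the intrinsic): every group of eight
unsigned bytes contributes one 64-bit lane holding the sum of absolute
differences. That sum is at most 8 * 255, so the upper bits of each i64 lane
are zero and the bitcast to a vector of i32 in the code above loses nothing.

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <vector>

// Sum-of-absolute-differences per 8-byte group, one result per group,
// mirroring how each PSADBW lane is formed.
std::vector<uint64_t> psadbw(const std::vector<uint8_t> &A,
                             const std::vector<uint8_t> &B) {
  std::vector<uint64_t> R(A.size() / 8, 0);
  for (size_t i = 0; i < A.size(); ++i)
    R[i / 8] += static_cast<uint64_t>(std::abs(int(A[i]) - int(B[i])));
  return R;
}

int main() {
  std::vector<uint8_t> A{10, 0, 5, 5, 5, 5, 5, 5};
  std::vector<uint8_t> B{0, 10, 5, 5, 5, 5, 5, 5};
  assert(psadbw(A, B)[0] == 20);
}
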
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -43294,8 +43809,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
}
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (zext (build_vector)), (zext (build_vector))),
-// (add (zext (build_vector)), (zext (build_vector)))
+// (mul (add (sext (build_vector)), (sext (build_vector))),
+// (add (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
@@ -43415,6 +43930,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
}
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
if (Flags.hasVectorReduction()) {
@@ -43445,8 +43961,29 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
HADDBuilder);
}
- if (SDValue V = combineIncDecVector(N, DAG))
- return V;
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
+ // generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
+ if (VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
+ Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
+ }
+
+ if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
+ Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
+ }
+ }
return combineAddOrSubToADCOrSBB(N, DAG);
}
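
The new vXi1 transform in combineAdd relies on a small identity: for a 1-bit
value b, zext(b) is 0 or 1 while sext(b) is 0 or -1, so Y + zext(b) equals
Y - sext(b) in every lane. A scalar sanity check of that identity (illustrative
sketch, not the DAG code):

#include <cassert>
#include <cstdint>

int32_t add_zext(bool b, int32_t y) { return y + static_cast<int32_t>(b); }
int32_t sub_sext(bool b, int32_t y) { return y - (b ? -1 : 0); } // sext of i1

int main() {
  for (int b = 0; b <= 1; ++b)
    for (int32_t y : {-7, 0, 42})
      assert(add_zext(b, y) == sub_sext(b, y));
}
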
@@ -43457,13 +43994,15 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
// PSUBUS is supported, starting from SSE2, but truncation for v8i32
// is only worth it with SSSE3 (PSHUFB).
- if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
+ EVT EltVT = VT.getVectorElementType();
+ if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
!(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
- !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
- !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
- VT == MVT::v16i32 || VT == MVT::v8i64)))
+ !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
return SDValue();
SDValue SubusLHS, SubusRHS;
@@ -43493,16 +44032,13 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
} else
return SDValue();
- auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops);
- };
-
// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
// special preprocessing in some cases.
- if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
- { SubusLHS, SubusRHS }, USUBSATBuilder);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
+
+ assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
+ "Unexpected VT!");
// Special preprocessing case can be only applied
// if the value was zero extended from 16 bit,
@@ -43531,15 +44067,16 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SDValue NewSubusLHS =
DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
- SDValue Psubus =
- SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
- { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
+ SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
+ NewSubusLHS, NewSubusRHS);
+
// Zero extend the result, it may be used somewhere as 32 bit,
// if not zext and following trunc will shrink.
return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
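
combineSubToSubus now emits ISD::USUBSAT directly for any i8/i16 element type and
no longer splits wide vectors by hand. As a reminder of the semantics involved,
here is a rough scalar sketch (made-up helper names, assuming the LHS is known to
be zero-extended from 16 bits, as the surrounding code checks): the unsigned
saturating subtract clamps at zero, and the umin preprocessing keeps the RHS
inside 16-bit range so the subtract can be done at the narrow width and
zero-extended back.

#include <algorithm>
#include <cassert>
#include <cstdint>

// ISD::USUBSAT per element: unsigned subtract that clamps at zero
// (what PSUBUS provides for i8/i16 lanes).
uint16_t usubsat16(uint16_t a, uint16_t b) {
  return a > b ? static_cast<uint16_t>(a - b) : 0;
}

// Sketch of the "special preprocessing" path: clamp the RHS into 16-bit
// range, subtract at the narrow width, then zero-extend the result back.
uint32_t usubsat_via_16(uint32_t a /* known zext from i16 */, uint32_t b) {
  uint32_t bClamped = std::min<uint32_t>(b, 0xFFFFu);
  return usubsat16(static_cast<uint16_t>(a), static_cast<uint16_t>(bClamped));
}

int main() {
  assert(usubsat16(5, 9) == 0 && usubsat16(9, 5) == 4);
  assert(usubsat_via_16(100, 0x12345) == 0); // RHS above 0xFFFF clamps the result to 0
}
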
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -43576,9 +44113,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
HSUBBuilder);
}
- if (SDValue V = combineIncDecVector(N, DAG))
- return V;
-
// Try to create PSUBUS if SUB's argument is max/min
if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
return V;
@@ -43712,14 +44246,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
- // If we're inserting all zeros into the upper half, change this to
- // an insert into an all zeros vector. We will match this to a move
- // with implicit upper bit zeroing during isel.
- if (Ops.size() == 2 && ISD::isBuildVectorAllZeros(Ops[1].getNode()))
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- getZeroVector(VT, Subtarget, DAG, DL), Ops[0],
- DAG.getIntPtrConstant(0, DL));
-
return SDValue();
}
@@ -43786,10 +44312,10 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// least as large as the original insertion. Just insert the original
// subvector into a zero vector.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
- SubVec.getConstantOperandAPInt(1) == 0 &&
+ isNullConstant(SubVec.getOperand(1)) &&
SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Ins = SubVec.getOperand(0);
- if (Ins.getConstantOperandAPInt(2) == 0 &&
+ if (isNullConstant(Ins.getOperand(2)) &&
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
@@ -43825,31 +44351,42 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// Match concat_vector style patterns.
SmallVector<SDValue, 2> SubVectorOps;
- if (collectConcatOps(N, SubVectorOps))
+ if (collectConcatOps(N, SubVectorOps)) {
if (SDValue Fold =
combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
return Fold;
- // If we are inserting into both halves of the vector, the starting vector
- // should be undef. If it isn't, make it so. Only do this if the early insert
- // has no other uses.
- // TODO: Should this be a generic DAG combine?
- // TODO: Why doesn't SimplifyDemandedVectorElts catch this?
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2 &&
- isNullConstant(Vec.getOperand(2)) && !Vec.getOperand(0).isUndef() &&
- Vec.hasOneUse()) {
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
- Vec.getOperand(1), Vec.getOperand(2));
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
- N->getOperand(2));
+ // If we're inserting all zeros into the upper half, change this to
+ // a concat with zero. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ // We do this here because we don't want combineConcatVectorOps to
+ // create INSERT_SUBVECTOR from CONCAT_VECTORS.
+ if (SubVectorOps.size() == 2 &&
+ ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
}
// If this is a broadcast insert into an upper undef, use a larger broadcast.
if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
+ // If this is a broadcast load inserted into an upper undef, use a larger
+ // broadcast load.
+ if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
+ SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
+ SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
+ SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
return SDValue();
}
@@ -43928,12 +44465,15 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return SDValue();
MVT VT = N->getSimpleValueType(0);
- EVT WideVecVT = N->getOperand(0).getValueType();
- SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
+ SDValue InVec = N->getOperand(0);
+ SDValue InVecBC = peekThroughBitcasts(InVec);
+ EVT InVecVT = InVec.getValueType();
+ EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
- TLI.isTypeLegal(WideVecVT) &&
- WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
+ TLI.isTypeLegal(InVecVT) &&
+ InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
auto isConcatenatedNot = [] (SDValue V) {
V = peekThroughBitcasts(V);
if (!isBitwiseNot(V))
@@ -43941,12 +44481,12 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue NotOp = V->getOperand(0);
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
};
- if (isConcatenatedNot(WideVec.getOperand(0)) ||
- isConcatenatedNot(WideVec.getOperand(1))) {
+ if (isConcatenatedNot(InVecBC.getOperand(0)) ||
+ isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(WideVec, DAG);
+ SDValue Concat = split256IntArith(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
- DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
+ DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
}
@@ -43956,7 +44496,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- SDValue InVec = N->getOperand(0);
unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
@@ -43976,31 +44515,42 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
// TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec.getOpcode() == ISD::BITCAST &&
- InVec.getOperand(0).getValueType().isVector()) {
- SDValue SrcOp = InVec.getOperand(0);
- EVT SrcVT = SrcOp.getValueType();
- unsigned SrcNumElts = SrcVT.getVectorNumElements();
- unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+ if (InVec != InVecBC && InVecBCVT.isVector()) {
+ unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
+ unsigned DestNumElts = InVecVT.getVectorNumElements();
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- SrcVT.getScalarType(), NewExtNumElts);
+ InVecBCVT.getScalarType(), NewExtNumElts);
if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
SDLoc DL(N);
SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- SrcOp, NewIndex);
+ InVecBC, NewIndex);
return DAG.getBitcast(VT, NewExtract);
}
}
}
}
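
The index-scaling block just above is plain arithmetic on element counts; a
small worked example (hypothetical helper, element counts only, no EVTs):
extracting v4i32 at index 4 from (bitcast v4i64 X to v8i32) becomes
bitcast (extract_subvector v2i64 X, 2) to v4i32.

#include <cassert>

struct NewExtract { unsigned NumElts, Index; };

// DestSrcRatio in the code above: how many bitcast elements cover one source
// element; the extraction size and index shrink by that ratio.
NewExtract scaleExtract(unsigned SrcNumElts, unsigned BcNumElts,
                        unsigned ExtNumElts, unsigned Index) {
  unsigned Ratio = BcNumElts / SrcNumElts;
  return {ExtNumElts / Ratio, Index / Ratio};
}

int main() {
  NewExtract E = scaleExtract(/*v4i64*/ 4, /*v8i32*/ 8, /*v4i32*/ 4, /*Index*/ 4);
  assert(E.NumElts == 2 && E.Index == 2);
}
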
+ // If we are extracting from an insert into a zero vector, replace with a
+ // smaller insert into zero if we don't access less than the original
+ // subvector. Don't do this for i1 vectors.
+ if (VT.getVectorElementType() != MVT::i1 &&
+ InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
+ InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
+ ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+ InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ InVec.getOperand(1), InVec.getOperand(2));
+ }
+
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As its a broadcast we don't care about the extraction index.
@@ -44008,11 +44558,25 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
+ if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+ if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
unsigned InOpcode = InVec.getOpcode();
- if (VT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
+ if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
// v2f64 CVTDQ2PD(v4i32).
if (InOpcode == ISD::SINT_TO_FP &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
@@ -44093,7 +44657,8 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
// Simplify PMULDQ and PMULUDQ operations.
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -44103,23 +44668,43 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
// Multiply by zero.
+ // Don't return RHS as it may contain UNDEFs.
if (ISD::isBuildVectorAllZeros(RHS.getNode()))
- return RHS;
-
- // Aggressively peek through ops to get at the demanded low bits.
- APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask);
- SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask);
- if (DemandedLHS || DemandedRHS)
- return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
- DemandedLHS ? DemandedLHS : LHS,
- DemandedRHS ? DemandedRHS : RHS);
+ return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
return SDValue(N, 0);
+ // If the input is an extend_invec and the SimplifyDemandedBits call didn't
+ // convert it to any_extend_invec, due to the LegalOperations check, do the
+ // conversion directly to a vector shuffle manually. This exposes combine
+ // opportunities missed by combineExtInVec not calling
+ // combineX86ShufflesRecursively on SSE4.1 targets.
+ // FIXME: This is basically a hack around several other issues related to
+ // ANY_EXTEND_VECTOR_INREG.
+ if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ LHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
+ LHS.getOperand(0), { 0, -1, 1, -1 });
+ LHS = DAG.getBitcast(MVT::v2i64, LHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+ if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
+ (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+ RHS.getOperand(0).getValueType() == MVT::v4i32) {
+ SDLoc dl(N);
+ RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
+ RHS.getOperand(0), { 0, -1, 1, -1 });
+ RHS = DAG.getBitcast(MVT::v2i64, RHS);
+ return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
+ }
+
return SDValue();
}
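
The shuffle-based rewrite added to combinePMULDQ is sound because PMULDQ and
PMULUDQ read only the low 32 bits of each 64-bit lane: placing v4i32 elements 0
and 1 into the even dword positions (mask {0, -1, 1, -1}) therefore feeds the
multiply the same values a zero/sign_extend_vector_inreg input would have. A
standalone model of the unsigned case (illustrative only; pmuludq is a made-up
helper, not the intrinsic):

#include <array>
#include <cassert>
#include <cstdint>

// Each 64-bit result lane is the full product of the low 32 bits of the
// corresponding input lanes; the high halves of the inputs are ignored.
std::array<uint64_t, 2> pmuludq(std::array<uint64_t, 2> a,
                                std::array<uint64_t, 2> b) {
  return {(a[0] & 0xFFFFFFFFull) * (b[0] & 0xFFFFFFFFull),
          (a[1] & 0xFFFFFFFFull) * (b[1] & 0xFFFFFFFFull)};
}

int main() {
  auto R = pmuludq({0x100000003ull, 0xFFFFFFFF00000005ull}, {7ull, 9ull});
  assert(R[0] == 21 && R[1] == 45); // high input halves do not matter
}
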
@@ -44134,7 +44719,7 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
In.hasOneUse()) {
auto *Ld = cast<LoadSDNode>(In);
- if (!Ld->isVolatile()) {
+ if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
@@ -44150,17 +44735,6 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
}
}
- // Disabling for widening legalization for now. We can enable if we find a
- // case that needs it. Otherwise it can be deleted when we switch to
- // widening legalization.
- if (ExperimentalVectorWideningLegalization)
- return SDValue();
-
- // Combine (ext_invec (ext_invec X)) -> (ext_invec X)
- if (In.getOpcode() == N->getOpcode() &&
- TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getOperand(0).getValueType()))
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, In.getOperand(0));
-
// Attempt to combine as a shuffle.
// TODO: SSE41 support
if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
@@ -44173,6 +44747,20 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -44196,8 +44784,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case X86ISD::CMP: return combineCMP(N, DAG);
- case ISD::ADD: return combineAdd(N, DAG, Subtarget);
- case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
+ case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
case X86ISD::SBB: return combineSBB(N, DAG);
@@ -44214,12 +44802,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
@@ -44299,20 +44888,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
- case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
- case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
+ case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
+ case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
case X86ISD::MGATHER:
- case X86ISD::MSCATTER:
+ case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
case ISD::MGATHER:
- case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
case X86ISD::PMULDQ:
- case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
+ case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
+ case X86ISD::KSHIFTL:
+ case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
}
return SDValue();
@@ -44660,10 +45251,11 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'I':
case 'J':
case 'K':
- case 'L':
- case 'M':
case 'N':
case 'G':
+ case 'L':
+ case 'M':
+ return C_Immediate;
case 'C':
case 'e':
case 'Z':
@@ -45175,8 +45767,9 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
- // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
- // Vector types.
+ // TODO: Handle i128 in FR128RegClass after it is tested well.
+ // Vector types and fp128.
+ case MVT::f128:
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
@@ -45469,7 +46062,7 @@ void X86TargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
@@ -45514,3 +46107,16 @@ X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
+
+unsigned
+X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
+  // The default stack probe size is 4096 if the function has no
+  // "stack-probe-size" attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ return StackProbeSize;
+}
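
A minimal standalone equivalent of the new getStackProbeSize hook, assuming only
what the code above shows: a 4096-byte default that a numeric "stack-probe-size"
function attribute may override (std::strtoul stands in for
StringRef::getAsInteger here; this is a sketch, not LLVM API).

#include <cassert>
#include <cstdlib>
#include <string>

// Returns the probe size to use: the attribute value when present and
// parseable, otherwise the 4096-byte default.
unsigned stackProbeSize(const std::string *Attr /* may be null */) {
  unsigned Size = 4096;
  if (Attr && !Attr->empty()) {
    char *End = nullptr;
    unsigned long V = std::strtoul(Attr->c_str(), &End, 0);
    if (End != Attr->c_str() && *End == '\0')
      Size = static_cast<unsigned>(V);
  }
  return Size;
}

int main() {
  std::string Attr = "8192";
  assert(stackProbeSize(nullptr) == 4096 && stackProbeSize(&Attr) == 8192);
}
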
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index e0be03bc3f9d..6f7e90008de4 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Target/TargetOptions.h"
namespace llvm {
class X86Subtarget;
@@ -144,6 +143,10 @@ namespace llvm {
/// relative displacements.
WrapperRIP,
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
/// Copies a 64-bit value from the low word of an XMM vector
/// to an MMX vector.
MOVDQ2Q,
@@ -422,7 +425,8 @@ namespace llvm {
// Tests Types Of a FP Values for scalar types.
VFPCLASSS,
- // Broadcast scalar to vector.
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
VBROADCAST,
// Broadcast mask to vector.
VBROADCASTM,
@@ -611,6 +615,9 @@ namespace llvm {
// extract_vector_elt, store.
VEXTRACT_STORE,
+ // scalar broadcast from memory
+ VBROADCAST_LOAD,
+
    // Store FP control word into i16 memory.
FNSTCW16m,
@@ -680,6 +687,9 @@ namespace llvm {
bool isCalleePop(CallingConv::ID CallingConv,
bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
+ /// If Op is a constant whose elements are all the same constant or
+ /// undefined, return true and return the constant value in \p SplatVal.
+ bool isConstantSplat(SDValue Op, APInt &SplatVal);
} // end namespace X86
//===--------------------------------------------------------------------===//
@@ -792,6 +802,17 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+ /// Return 1 if we can compute the negated form of the specified expression
+ /// for the same cost as the expression itself, or 2 if we can compute the
+ /// negated form more cheaply than the expression itself. Else return 0.
+ char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
+ bool ForCodeSize, unsigned Depth) const override;
+
+ /// If isNegatibleForFree returns true, return the newly negated expression.
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ unsigned Depth) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -840,6 +861,13 @@ namespace llvm {
bool hasAndNot(SDValue Y) const override;
+ bool hasBitTest(SDValue X, SDValue Y) const override;
+
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
@@ -863,11 +891,7 @@ namespace llvm {
return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
}
- bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return false;
- return true;
- }
+ bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
bool shouldSplatInsEltVarIndex(EVT VT) const override;
@@ -913,6 +937,10 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const override;
+
const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
SDValue unwrapAddress(SDValue N) const override;
@@ -1090,11 +1118,12 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool reduceSelectOfFPConstantLoads(bool IsFPSetCC) const override;
+ bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
bool convertSelectOfConstantsToMath(EVT VT) const override;
- bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
bool IsSigned) const override;
@@ -1136,8 +1165,8 @@ namespace llvm {
return nullptr; // nothing to do, move along.
}
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
@@ -1189,12 +1218,18 @@ namespace llvm {
CallingConv::ID CC,
EVT VT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+ unsigned getStackProbeSize(MachineFunction &MF) const;
+
bool hasVectorBlend() const override { return true; }
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -1326,6 +1361,12 @@ namespace llvm {
SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+ RTLIB::Libcall Call) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -1372,6 +1413,9 @@ namespace llvm {
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+ bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
+ bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
+
bool needsCmpXchgNb(Type *MemType) const;
void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
@@ -1462,6 +1506,9 @@ namespace llvm {
/// Reassociate floating point divisions into multiply by reciprocal.
unsigned combineRepeatedFPDivisors() const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
};
namespace X86 {
@@ -1625,24 +1672,24 @@ namespace llvm {
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
template <typename T>
- void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
+ void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
SmallVectorImpl<T> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
size_t NumElts = Mask.size();
ScaledMask.assign(NumElts * Scale, -1);
- for (int i = 0; i != (int)NumElts; ++i) {
+ for (size_t i = 0; i != NumElts; ++i) {
int M = Mask[i];
// Repeat sentinel values in every mask element.
if (M < 0) {
- for (int s = 0; s != Scale; ++s)
+ for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = M;
continue;
}
// Scale mask element and increment across each mask element.
- for (int s = 0; s != Scale; ++s)
+ for (size_t s = 0; s != Scale; ++s)
ScaledMask[(Scale * i) + s] = (Scale * M) + s;
}
}
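
The scaleShuffleMask hunk only switches the loop and scale types to size_t; the
behaviour is unchanged. A compilable copy of the same widening logic with a
concrete example (outside LLVM, illustrative only):

#include <cassert>
#include <vector>

// Each mask element M becomes Scale consecutive entries Scale*M .. Scale*M+Scale-1;
// sentinel values (< 0) are simply repeated Scale times.
std::vector<int> scaleMask(size_t Scale, const std::vector<int> &Mask) {
  std::vector<int> Out(Mask.size() * Scale, -1);
  for (size_t i = 0; i != Mask.size(); ++i)
    for (size_t s = 0; s != Scale; ++s)
      Out[Scale * i + s] = Mask[i] < 0 ? Mask[i] : int(Scale * Mask[i] + s);
  return Out;
}

int main() {
  // Widening a 4-element shuffle mask to the equivalent 8-element mask.
  assert((scaleMask(2, {1, -1, 0, 3}) ==
          std::vector<int>{2, 3, -1, -1, 0, 1, 6, 7}));
}
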
diff --git a/lib/Target/X86/X86IndirectBranchTracking.cpp b/lib/Target/X86/X86IndirectBranchTracking.cpp
index 04e8b2231fec..cc0f59ab329d 100644
--- a/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -84,7 +84,7 @@ bool X86IndirectBranchTrackingPass::addENDBR(
return false;
}
-bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
+static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!MOp.isGlobal())
return false;
auto *CalleeFn = dyn_cast<Function>(MOp.getGlobal());
diff --git a/lib/Target/X86/X86InsertPrefetch.cpp b/lib/Target/X86/X86InsertPrefetch.cpp
index 02ae73706a34..2b1e3f23efd7 100644
--- a/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/lib/Target/X86/X86InsertPrefetch.cpp
@@ -79,8 +79,8 @@ ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples,
// The prefetch instruction can't take memory operands involving vector
// registers.
bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) {
- unsigned BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
- unsigned IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
+ Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg();
+ Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg();
return (BaseReg == 0 ||
X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) ||
X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) &&
@@ -108,7 +108,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples,
Prefetches &Prefetches) const {
assert(Prefetches.empty() &&
"Expected caller passed empty PrefetchInfo vector.");
- static const std::pair<const StringRef, unsigned> HintTypes[] = {
+ static constexpr std::pair<StringLiteral, unsigned> HintTypes[] = {
{"_nta_", X86::PREFETCHNTA},
{"_t0_", X86::PREFETCHT0},
{"_t1_", X86::PREFETCHT1},
@@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) {
void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
}
bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 54eddeacaa17..9b5de59430a5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -74,6 +74,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+ PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
!cast<ComplexPattern>("sse_load_f32"),
@@ -412,6 +413,14 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
+let Predicates = [HasAVX512] in {
+def : Pat<(v64i8 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v32i16 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+}
+
// Alias instructions that allow VPTERNLOG to be used with a mask to create
// a mix of all ones and all zeros elements. This is done this way to force
// the same register to be used as input for all three sources.
@@ -436,6 +445,19 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
[(set VR256X:$dst, (v8i32 immAllZerosV))]>;
}
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i16 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v4f32 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (AVX512_128_SET0)>;
+def : Pat<(v32i8 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX512_256_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX512_256_SET0)>;
+}
+
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -443,7 +465,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
- [(set FR64X:$dst, fpimm0)]>;
+ [(set FR64X:$dst, fp64imm0)]>;
+ def AVX512_FsFLD0F128 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
+ [(set VR128X:$dst, fp128imm0)]>;
}
//===----------------------------------------------------------------------===//
@@ -730,14 +754,14 @@ let isCommutable = 1 in
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>,
EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
EVEX_4V, EVEX_CD8<32, CD8VT1>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -1100,75 +1124,104 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
X86VectorVTInfo MaskInfo,
X86VectorVTInfo DestInfo,
X86VectorVTInfo SrcInfo,
- SDPatternOperator UnmaskedOp = X86VBroadcast> {
- let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in {
- defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo,
- (outs MaskInfo.RC:$dst),
- (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))),
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>,
- T8PD, EVEX, Sched<[SchedRR]>;
- let mayLoad = 1 in
- defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
- (outs MaskInfo.RC:$dst),
- (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT (UnmaskedOp
- (SrcInfo.ScalarLdFrag addr:$src))))),
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src)))))>,
- T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
- Sched<[SchedRM]>;
- }
-
- def : Pat<(MaskInfo.VT
- (bitconvert
- (DestInfo.VT (UnmaskedOp
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src))))))),
- (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>;
- def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ bit IsConvertibleToThreeAddress,
+ SDPatternOperator UnmaskedOp = X86VBroadcast,
+ SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> {
+ let hasSideEffects = 0 in
+ def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
+ DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))))),
- MaskInfo.RC:$src0)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#mk)
- MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
- def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ let Constraints = "$src0 = $dst" in
+ def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+
+ let hasSideEffects = 0, mayLoad = 1 in
+ def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedBcastOp addr:$src)))))],
+ DestInfo.ExeDomain>, T8PD, EVEX,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
(bitconvert
(DestInfo.VT
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))))),
- MaskInfo.ImmAllZerosV)),
- (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz)
- MaskInfo.KRCWM:$mask, addr:$src)>;
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ let Constraints = "$src0 = $dst",
+ isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+ def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
// Helper class to force mask and broadcast result to same type.
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name,
SchedWrite SchedRR, SchedWrite SchedRM,
X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo> :
+ X86VectorVTInfo SrcInfo,
+ bit IsConvertibleToThreeAddress> :
avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM,
- DestInfo, DestInfo, SrcInfo>;
+ DestInfo, DestInfo, SrcInfo,
+ IsConvertibleToThreeAddress>;
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info512, _.info128>,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@@ -1176,7 +1229,7 @@ multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info256, _.info128>,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
@@ -1187,7 +1240,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info512, _.info128>,
+ WriteFShuffle256Ld, _.info512, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512,
_.info128>,
EVEX_V512;
@@ -1195,12 +1248,12 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info256, _.info128>,
+ WriteFShuffle256Ld, _.info256, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256,
_.info128>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256,
- WriteFShuffle256Ld, _.info128, _.info128>,
+ WriteFShuffle256Ld, _.info128, _.info128, 1>,
avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128,
_.info128>,
EVEX_V128;
@@ -1284,46 +1337,35 @@ defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
X86VBroadcast, GR64, HasAVX512>, VEX_W;
-// Provide aliases for broadcast from the same register class that
-// automatically does the extract.
-multiclass avx512_int_broadcast_rm_lowering<string Name,
- X86VectorVTInfo DestInfo,
- X86VectorVTInfo SrcInfo,
- X86VectorVTInfo ExtInfo> {
- def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#"r")
- (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>;
-}
-
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ bit IsConvertibleToThreeAddress> {
let Predicates = [prd] in {
defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
- WriteShuffle256Ld, _.info512, _.info128>,
- avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>,
+ WriteShuffle256Ld, _.info512, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V512;
- // Defined separately to avoid redefinition.
- defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>;
}
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256,
- WriteShuffle256Ld, _.info256, _.info128>,
- avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>,
+ WriteShuffle256Ld, _.info256, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V256;
defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle,
- WriteShuffleXLd, _.info128, _.info128>,
+ WriteShuffleXLd, _.info128, _.info128,
+ IsConvertibleToThreeAddress>,
EVEX_V128;
}
}
defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb",
- avx512vl_i8_info, HasBWI>;
+ avx512vl_i8_info, HasBWI, 0>;
defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",
- avx512vl_i16_info, HasBWI>;
+ avx512vl_i16_info, HasBWI, 0>;
defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",
- avx512vl_i32_info, HasAVX512>;
+ avx512vl_i32_info, HasAVX512, 1>;
defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq",
- avx512vl_i64_info, HasAVX512>, VEX_W1X;
+ avx512vl_i64_info, HasAVX512, 1>, VEX_W1X;
multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
@@ -1354,6 +1396,10 @@ let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZm addr:$src)>;
}
let Predicates = [HasVLX] in {
@@ -1362,6 +1408,12 @@ let Predicates = [HasVLX] in {
(VPBROADCASTQZ128m addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZ128m addr:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VPBROADCASTDZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -1382,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZ256m addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZ128m addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZ256m addr:$src)>;
}
let Predicates = [HasBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -1394,6 +1452,10 @@ let Predicates = [HasBWI] in {
def : Pat<(v32i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWZm addr:$src)>;
+
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWZm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -1629,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI] in
defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info512,
- _Src.info512, _Src.info128, null_frag>,
+ _Src.info512, _Src.info128, 0, null_frag, null_frag>,
EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256,
WriteShuffle256Ld, _Dst.info256,
- _Src.info256, _Src.info128, null_frag>,
+ _Src.info256, _Src.info128, 0, null_frag, null_frag>,
EVEX_V256;
}
@@ -1645,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
let Predicates = [HasDQI, HasVLX] in
defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle,
WriteShuffleXLd, _Dst.info128,
- _Src.info128, _Src.info128, null_frag>,
+ _Src.info128, _Src.info128, 0, null_frag, null_frag>,
EVEX_V128;
}
@@ -1654,23 +1716,6 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;
-let Predicates = [HasVLX] in {
-def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
-def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
-}
-
-def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
- (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>;
-def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
- (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>;
-
-def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
- (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>;
-def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))),
- (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>;
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST MASK TO VECTOR REGISTER
//---
@@ -1730,7 +1775,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src2,
- IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
AVX5128IBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1807,7 +1852,7 @@ multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86VPermt2 _.RC:$src2,
(IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.BroadcastLdFrag addr:$src3)),
(_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
@@ -1846,7 +1891,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
- IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>,
+ IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
AVX5128IBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1947,7 +1992,7 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
}
multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let mayLoad = 1, hasSideEffects = 0 in {
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
@@ -2031,9 +2076,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
+ timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2041,9 +2086,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc),
+ timm:$cc),
(OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -2052,9 +2097,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc",
(OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc),
+ timm:$cc),
(OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>,
+ timm:$cc)>,
EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
@@ -2065,7 +2110,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
- imm:$cc))]>,
+ timm:$cc))]>,
EVEX_4V, VEX_LIG, Sched<[sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
@@ -2074,7 +2119,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
- imm:$cc))]>,
+ timm:$cc))]>,
EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
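// Note on the imm -> timm changes throughout these compare patterns: the
// condition-code operand is produced as a TargetConstant during lowering, and
// timm matches TargetConstant nodes where plain imm matches Constant nodes.
// The instruction encodings and semantics are unchanged by this switch.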
@@ -2100,94 +2145,82 @@ let Predicates = [HasAVX512] in {
SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W;
}
-multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86FoldableSchedWrite sched,
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, bit IsCommutable> {
- let isCommutable = IsCommutable in
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>,
- EVEX_4V, Sched<[sched]>;
+ []>, EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2))))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- let isCommutable = IsCommutable in
+ []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let isCommutable = IsCommutable, hasSideEffects = 0 in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
- EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ let mayLoad = 1, hasSideEffects = 0 in
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1),
- (_.VT (_.LdFrag addr:$src2)))))]>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su,
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched, _, IsCommutable> {
+ avx512_icmp_packed<opc, OpcodeStr, sched, _, IsCommutable> {
+ let mayLoad = 1, hasSideEffects = 0 in {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.KRC:$dst, (and _.KRCWM:$mask,
- (OpNode_su (_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))]>,
- EVEX_4V, EVEX_K, EVEX_B,
+ []>, EVEX_4V, EVEX_K, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
}
-multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86SchedWriteWidths sched,
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo, Predicate prd,
bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, PatFrag OpNode_su,
X86SchedWriteWidths sched,
AVX512VLVectorVTInfo VTInfo,
Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM,
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.ZMM,
VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.YMM,
VTInfo.info256, IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM,
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, sched.XMM,
VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
@@ -2195,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
// This fragment treats X86cmpm as commutable to help match loads in both
// operands for PCMPEQ.
def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>;
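// SDNPCommutative lets TableGen also try the operand-swapped form of any
// pattern built on this fragment, which is what allows a load or broadcast in
// either operand of an equality compare to reach the memory forms.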
-def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
- (X86setcc_commute node:$src1, node:$src2, SETEQ)>;
def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
(setcc node:$src1, node:$src2, SETGT)>;
-def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpeqm_c node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpgtm node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-
// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
// increase the pattern complexity the way an immediate would.
let AddedComplexity = 2 in {
// FIXME: Is there a better scheduler class for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb",
SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw",
SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd",
SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb",
SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw",
SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd",
SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su,
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq",
SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
}
@@ -2322,8 +2344,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
"$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
[(set _.KRC:$dst, (_.KVT (Frag:$cc
(_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.BroadcastLdFrag addr:$src2),
cond)))]>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
@@ -2335,23 +2356,21 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(_.KVT (Frag_su:$cc
(_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.BroadcastLdFrag addr:$src2),
cond))))]>,
EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
- def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
(!cast<Instruction>(Name#_.ZSuffix#"rmib")
_.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
def : Pat<(and _.KRCWM:$mask,
- (_.KVT (CommFrag_su:$cc (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond))),
(!cast<Instruction>(Name#_.ZSuffix#"rmibk")
_.KRCWM:$mask, _.RC:$src1, addr:$src2,
- (CommFrag.OperandTransform $cc))>;
+ (CommFrag_su.OperandTransform $cc))>;
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag,
@@ -2496,14 +2515,19 @@ def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
return N->hasOneUse();
}]>;
+def X86cmpm_imm_commute : SDNodeXForm<timm, [{
+ uint8_t Imm = X86::getSwappedVCMPImm(N->getZExtValue() & 0x1f);
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
+
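// The XForm above is only needed when a compare's operands are commuted so
// that a load or broadcast on the left-hand side can still be folded: swapping
// the operands is legal as long as the predicate is swapped with them. Below
// is a minimal, standalone C++ sketch of that identity over a hypothetical
// predicate set; it is not the real 5-bit VCMP encoding, which is handled by
// X86::getSwappedVCMPImm.

#include <cassert>

enum Pred { EQ, NE, LT, LE, GT, GE }; // illustrative only, not the VCMP space

// Swap a predicate so that cmp(a, P, b) == cmp(b, swapPred(P), a).
constexpr Pred swapPred(Pred P) {
  switch (P) {
  case LT: return GT;
  case LE: return GE;
  case GT: return LT;
  case GE: return LE;
  default: return P; // EQ and NE are symmetric.
  }
}

constexpr bool cmp(int A, Pred P, int B) {
  switch (P) {
  case EQ: return A == B;
  case NE: return A != B;
  case LT: return A < B;
  case LE: return A <= B;
  case GT: return A > B;
  case GE: return A >= B;
  }
  return false;
}

int main() {
  const Pred Preds[] = {EQ, NE, LT, LE, GT, GE};
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B)
      for (Pred P : Preds)
        assert(cmp(A, P, B) == cmp(B, swapPred(P), A));
  return 0;
}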
multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
- (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
1>, Sched<[sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
@@ -2511,9 +2535,9 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
- imm:$cc),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
- imm:$cc)>,
+ timm:$cc)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
@@ -2523,38 +2547,37 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
"$cc, ${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr#", $cc",
(X86cmpm (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc)>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
  // Patterns for selecting with the load in the other operand.
def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
- CommutableCMPCC:$cc),
+ timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
(_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ timm:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1), CommutableCMPCC:$cc),
+ def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1),
- CommutableCMPCC:$cc)),
+ timm:$cc)),
(!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
_.RC:$src1, addr:$src2,
- imm:$cc)>;
+ (X86cmpm_imm_commute timm:$cc))>;
}
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
@@ -2564,9 +2587,9 @@ multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
"$src1, $src2, {sae}, $cc",
- (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+ (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- imm:$cc)>,
+ timm:$cc)>,
EVEX_B, Sched<[sched]>;
}
@@ -2590,12 +2613,12 @@ defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
// Patterns to select fp compares with a load as the first operand.
let Predicates = [HasAVX512] in {
def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
- CommutableCMPCC:$cc)),
- (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>;
+ timm:$cc)),
+ (VCMPSDZrm FR64X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1,
- CommutableCMPCC:$cc)),
- (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>;
+ timm:$cc)),
+ (VCMPSSZrm FR32X:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>;
}
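// With timm condition codes, the load-on-the-left scalar compares above are
// handled by swapping the predicate via X86cmpm_imm_commute instead of being
// limited to the condition codes CommutableCMPCC used to accept.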
// ----------------------------------------------------------------
@@ -2621,7 +2644,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
@@ -2629,7 +2652,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su (_.VT _.RC:$src1),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
@@ -2637,7 +2660,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
(X86Vfpclasss _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
@@ -2645,7 +2668,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -2661,7 +2684,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
@@ -2669,7 +2692,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclass_su (_.VT _.RC:$src1),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
@@ -2677,7 +2700,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
- (i32 imm:$src2)))]>,
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
@@ -2685,7 +2708,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
- (i32 imm:$src2))))]>,
+ (i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
@@ -2693,9 +2716,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
_.BroadcastStr##", $dst|$dst, ${src1}"
##_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(X86Vfpclass
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))]>,
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
@@ -2703,9 +2725,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
_.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
_.BroadcastStr##", $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
- (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))]>,
+ (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))))]>,
EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2836,13 +2857,21 @@ def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
(KMOVWrk VK16:$src)>;
+def : Pat<(i64 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVWrk VK16:$src), sub_32bit)>;
def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
(COPY_TO_REGCLASS VK16:$src, GR32)>;
+def : Pat<(i64 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK16:$src, GR32), sub_32bit)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
+def : Pat<(i64 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (SUBREG_TO_REG (i64 0), (KMOVBrk VK8:$src), sub_32bit)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
(COPY_TO_REGCLASS VK8:$src, GR32)>;
+def : Pat<(i64 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
+ (INSERT_SUBREG (IMPLICIT_DEF), (COPY_TO_REGCLASS VK8:$src, GR32), sub_32bit)>;
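// The zext patterns use SUBREG_TO_REG because the 32-bit KMOV result already
// has bits 63:32 cleared (every 32-bit GPR write zeroes the upper half), while
// the anyext patterns only need INSERT_SUBREG into an IMPLICIT_DEF since the
// upper bits are unspecified. A small standalone C++ sketch of the value-level
// semantics (the helper names here are illustrative, not LLVM APIs):

#include <cassert>
#include <cstdint>

// zext: upper 32 bits are guaranteed zero, matching the x86-64 rule that a
// 32-bit register write zeroes bits 63:32 of the full register.
uint64_t zext32to64(uint32_t Lo) { return static_cast<uint64_t>(Lo); }

// anyext: upper 32 bits carry no defined value; any contents are acceptable.
uint64_t anyext32to64(uint32_t Lo, uint64_t Junk) {
  return (Junk & 0xffffffff00000000ULL) | Lo;
}

int main() {
  assert(zext32to64(0xdeadbeefu) == 0x00000000deadbeefULL);
  // For anyext, only the low 32 bits are meaningful.
  assert((anyext32to64(0xdeadbeefu, 0x1234567800000000ULL) & 0xffffffffu) ==
         0xdeadbeefu);
  return 0;
}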
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
(COPY_TO_REGCLASS GR32:$src, VK32)>;
@@ -3075,7 +3104,7 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
- [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>,
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 timm:$imm)))]>,
Sched<[sched]>;
}
@@ -3098,30 +3127,6 @@ defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShu
defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>;
// Patterns for comparing 128/256-bit integer vectors using a 512-bit instruction.
-multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
- string InstStr,
- X86VectorVTInfo Narrow,
- X86VectorVTInfo Wide> {
- def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr#"Zrr")
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
- Narrow.KRC)>;
-
- def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (Frag_su (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2)))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr#"Zrrk")
- (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
- (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))),
- Narrow.KRC)>;
-}
-
-// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction.
multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
string InstStr,
X86VectorVTInfo Narrow,
@@ -3129,7 +3134,7 @@ multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2), cond)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrri)
+ (!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
(Frag.OperandTransform $cc)), Narrow.KRC)>;
@@ -3138,53 +3143,111 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
(Narrow.KVT (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2),
cond)))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- (Frag.OperandTransform $cc)), Narrow.KRC)>;
+ (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+}
+
+multiclass axv512_icmp_packed_cc_rmb_no_vlx_lowering<PatFrag Frag, PatFrag Frag_su,
+ PatFrag CommFrag, PatFrag CommFrag_su,
+ string InstStr,
+ X86VectorVTInfo Narrow,
+ X86VectorVTInfo Wide> {
+// Broadcast load.
+def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2), cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (Frag_su:$cc (Narrow.VT Narrow.RC:$src1),
+ (Narrow.BroadcastLdFrag addr:$src2),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (Frag_su.OperandTransform $cc)), Narrow.KRC)>;
+
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (CommFrag:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmib")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag.OperandTransform $cc)), Narrow.KRC)>;
+
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (Narrow.KVT
+ (CommFrag_su:$cc (Narrow.BroadcastLdFrag addr:$src2),
+ (Narrow.VT Narrow.RC:$src1),
+ cond)))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmibk")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (CommFrag_su.OperandTransform $cc)), Narrow.KRC)>;
}
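// Note the no-VLX recipe used by both multiclasses above: widen the
// 128/256-bit sources into an undef ZMM register (INSERT_SUBREG of an
// IMPLICIT_DEF), execute the 512-bit EVEX compare with the broadcast memory
// operand folded, then narrow the resulting mask with COPY_TO_REGCLASS; only
// the low lanes of the wide mask are meaningful.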
// Same as above, but for fp types which don't use PatFrags.
-multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, PatFrag OpNode_su,
- string InstStr,
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
-def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
- (!cast<Instruction>(InstStr##Zrri)
+ (!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- imm:$cc), Narrow.KRC)>;
+ timm:$cc), Narrow.KRC)>;
def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
- (OpNode_su (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), imm:$cc))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrrik")
(COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)),
- imm:$cc), Narrow.KRC)>;
-}
+ timm:$cc), Narrow.KRC)>;
-let Predicates = [HasAVX512, NoVLX] in {
- // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
- // increase the pattern complexity the way an immediate would.
- let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v8i32x_info, v16i32_info>;
+// Broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQD", v4i32x_info, v16i32_info>;
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, timm:$cc), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+// Commuted with broadcast load.
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr#"Zrmbi")
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQQ", v2i64x_info, v8i64_info>;
- }
+def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
+ (X86cmpm_su (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbik")
+ (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC),
+ (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
+ addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
+}
+let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
@@ -3197,29 +3260,25 @@ let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPQ", v2i64x_info, v8i64_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v8f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPS", v4f32x_info, v16f32_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v4f64x_info, v8f64_info>;
- defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, X86cmpm_su, "VCMPPD", v2f64x_info, v8f64_info>;
-}
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v8i32x_info, v16i32_info>;
-let Predicates = [HasBWI, NoVLX] in {
- // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
- // increase the pattern complexity the way an immediate would.
- let AddedComplexity = 2 in {
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpm, X86pcmpm_su, X86pcmpm_commute, X86pcmpm_commute_su, "VPCMPQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_cc_rmb_no_vlx_lowering<X86pcmpum, X86pcmpum_su, X86pcmpum_commute, X86pcmpum_commute_su, "VPCMPUQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, X86pcmpgtm_su, "VPCMPGTW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, X86pcmpeqm_c_su, "VPCMPEQW", v8i16x_info, v32i16_info>;
- }
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v8f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPS", v4f32x_info, v16f32_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v4f64x_info, v8f64_info>;
+ defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
+}
+let Predicates = [HasBWI, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -4186,16 +4245,32 @@ def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)),
(COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
(v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), (f32 FR32X:$src0))),
+ (COPY_TO_REGCLASS
+ (v4f32 (VMOVSSZrmk (v4f32 (COPY_TO_REGCLASS FR32X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR32X)>;
+def : Pat<(f32 (X86selects VK1WM:$mask, (loadf32 addr:$src), fp32imm0)),
+ (COPY_TO_REGCLASS (v4f32 (VMOVSSZrmkz VK1WM:$mask, addr:$src)), FR32X)>;
+
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)),
VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
(v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
-def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)),
+def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fp64imm0)),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
(v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))),
+ (COPY_TO_REGCLASS
+ (v2f64 (VMOVSDZrmk (v2f64 (COPY_TO_REGCLASS FR64X:$src0, VR128X)),
+ VK1WM:$mask, addr:$src)),
+ FR64X)>;
+def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)),
+ (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>;
+
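// These additions fold a scalar select-of-load into the masked VMOVSS/VMOVSD
// load forms: rmk merges with a pass-through register, rmkz zeroes instead.
// A rough standalone C++ sketch of the scalar semantics being matched,
// assuming a one-bit mask (the helper names are illustrative only):

#include <cassert>

// masked-merge load: dst = mask ? *p : src0
float maskedLoadMerge(bool Mask, const float *P, float Src0) {
  return Mask ? *P : Src0;
}

// masked-zero load: dst = mask ? *p : 0.0f
float maskedLoadZero(bool Mask, const float *P) {
  return Mask ? *P : 0.0f;
}

int main() {
  float V = 3.5f;
  assert(maskedLoadMerge(true, &V, 1.0f) == 3.5f);
  assert(maskedLoadMerge(false, &V, 1.0f) == 1.0f);
  assert(maskedLoadZero(false, &V) == 0.0f);
  return 0;
}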
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
@@ -4537,8 +4612,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (_.BroadcastLdFrag addr:$src2)))>,
AVX512BIBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4664,8 +4738,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
"${src2}"##_Brdct.BroadcastStr##", $src1",
"$src1, ${src2}"##_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Brdct.VT (X86VBroadcast
- (_Brdct.ScalarLdFrag addr:$src2))))))>,
+ (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
AVX512BIBase, EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4737,8 +4810,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"##_Src.BroadcastStr##", $src1",
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
- (_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2))))))>,
+ (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4874,22 +4946,11 @@ let Predicates = [HasDQI, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
-
- def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG
- (VPMULLQZrr
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
- sub_xmm)>;
-}
-
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
- def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
(EXTRACT_SUBREG
- (VPMULLQZrr
+ (VPMULLQZrmb
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ addr:$src2),
sub_ymm)>;
def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
@@ -4898,29 +4959,47 @@ let Predicates = [HasDQI, NoVLX] in {
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrmb
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
}
-multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
+multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
(EXTRACT_SUBREG
- (Instr
+ (!cast<Instruction>(Instr#"rr")
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
+ def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ addr:$src2),
+ sub_ymm)>;
def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
(EXTRACT_SUBREG
- (Instr
+ (!cast<Instruction>(Instr#"rr")
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
+ def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(Instr#"rmb")
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ addr:$src2),
+ sub_xmm)>;
}
let Predicates = [HasAVX512, NoVLX] in {
- defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
- defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
- defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
- defm : avx512_min_max_lowering<VPMINSQZrr, smin>;
+ defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
+ defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
+ defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
+ defm : avx512_min_max_lowering<"VPMINSQZ", smin>;
}
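// Packed 64-bit element min/max only exist as AVX-512 instructions, so without
// VLX the 128/256-bit operations are widened to the 512-bit forms above.
// Passing the instruction base name as a string and !cast'ing to the "rr" /
// "rmb" variants lets one multiclass cover both the register and the
// broadcast-load cases.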
//===----------------------------------------------------------------------===//
@@ -4977,32 +5056,6 @@ let Predicates = [HasVLX] in {
def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)),
(VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(and VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(or VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(xor VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR128X:$src1,
- (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>;
-
- def : Pat<(and VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(or VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(xor VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZ128rmb VR128X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR128X:$src1,
- (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>;
-
def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
(VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
@@ -5042,32 +5095,6 @@ let Predicates = [HasVLX] in {
(VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)),
(VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
-
- def : Pat<(and VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(or VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(xor VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR256X:$src1,
- (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>;
-
- def : Pat<(and VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(or VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(xor VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZ256rmb VR256X:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR256X:$src1,
- (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>;
}
let Predicates = [HasAVX512] in {
@@ -5110,32 +5137,6 @@ let Predicates = [HasAVX512] in {
(VPANDNQZrm VR512:$src1, addr:$src2)>;
def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)),
(VPANDNQZrm VR512:$src1, addr:$src2)>;
-
- def : Pat<(and VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(or VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPORDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(xor VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPXORDZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR512:$src1,
- (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
- (VPANDNDZrmb VR512:$src1, addr:$src2)>;
-
- def : Pat<(and VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(or VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPORQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(xor VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPXORQZrmb VR512:$src1, addr:$src2)>;
- def : Pat<(X86andnp VR512:$src1,
- (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
- (VPANDNQZrmb VR512:$src1, addr:$src2)>;
}
// Patterns to catch vselect with different type than logic op.
@@ -5174,25 +5175,17 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Register-broadcast logical operations.
- def : Pat<(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))),
- (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))))),
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
- (bitconvert (_.VT
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))))),
+ (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
@@ -5329,7 +5322,8 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
- X86FoldableSchedWrite sched, bit IsCommutable> {
+ X86FoldableSchedWrite sched, bit IsCommutable,
+ string EVEX2VexOvrd> {
let ExeDomain = _.ExeDomain in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -5349,7 +5343,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]> {
+ Sched<[sched]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rr"> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5357,7 +5352,8 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VexOvrd#"rm">;
}
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5387,10 +5383,12 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode VecNode, SDNode SaeNode,
X86SchedWriteSizes sched, bit IsCommutable> {
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
- VecNode, SaeNode, sched.PS.Scl, IsCommutable>,
+ VecNode, SaeNode, sched.PS.Scl, IsCommutable,
+ NAME#"SS">,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
- VecNode, SaeNode, sched.PD.Scl, IsCommutable>,
+ VecNode, SaeNode, sched.PD.Scl, IsCommutable,
+ NAME#"SD">,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
@@ -5410,13 +5408,14 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode,
- X86FoldableSchedWrite sched> {
+ X86FoldableSchedWrite sched,
+ string EVEX2VEXOvrd> {
let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>,
- Sched<[sched]> {
+ Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
@@ -5424,24 +5423,27 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>,
+ EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
- SchedWriteFCmp.Scl>, XS, EVEX_4V,
- VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMINCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
- SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
- VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMINCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
- SchedWriteFCmp.Scl>, XS, EVEX_4V,
- VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMAXCSS">, XS,
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
- SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V,
- VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ SchedWriteFCmp.Scl, "VMAXCSD">, XD,
+ VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
@@ -5464,8 +5466,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5595,8 +5596,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5751,13 +5751,13 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>,
+ (_.VT (OpNode _.RC:$src1, (i8 timm:$src2)))>,
Sched<[sched]>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
- (i8 imm:$src2)))>,
+ (i8 timm:$src2)))>,
Sched<[sched.Folded]>;
}
}
@@ -5769,7 +5769,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
- (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>,
+ (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>,
EVEX_B, Sched<[sched.Folded]>;
}
@@ -5911,17 +5911,17 @@ let Predicates = [HasAVX512, NoVLX] in {
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
VR128X:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
+ timm:$src2)), sub_xmm)>;
}
//===-------------------------------------------------------------------===//
@@ -5953,8 +5953,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))>,
+ (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6062,27 +6061,27 @@ let Predicates = [HasAVX512, NoVLX] in {
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
- def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
}
// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
@@ -6113,27 +6112,27 @@ let Predicates = [HasAVX512, NoVLX] in {
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
- def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
- def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- imm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ timm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORDZri
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- imm:$src2)), sub_ymm)>;
+ timm:$src2)), sub_ymm)>;
}
//===-------------------------------------------------------------------===//
@@ -6228,8 +6227,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode
_.RC:$src1,
- (Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2)))))>,
+ (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6419,7 +6417,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6493,7 +6491,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6571,7 +6569,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
- (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6964,7 +6962,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -7504,14 +7502,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
- (X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
+ (_Src.BroadcastLdFrag addr:$src))
)),
(vselect MaskRC:$mask,
(_.VT
(OpNode
(_Src.VT
- (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src))))),
+ (_Src.BroadcastLdFrag addr:$src)))),
_.RC:$src0),
vselect, "$src0 = $dst">,
EVEX, EVEX_B, Sched<[sched.Folded]>;
@@ -7646,14 +7643,14 @@ let Predicates = [HasAVX512] in {
v8f32x_info.ImmAllZerosV),
(VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2PSZrmb addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
(v8f32 VR256X:$src0)),
(VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))),
+ (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
v8f32x_info.ImmAllZerosV),
(VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>;
}
@@ -7677,14 +7674,14 @@ let Predicates = [HasVLX] in {
v4f32x_info.ImmAllZerosV),
(VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2PSZ256rmb addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
VR128X:$src0),
(VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))),
+ (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
v4f32x_info.ImmAllZerosV),
(VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>;
@@ -7708,12 +7705,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))),
+ def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
(VCVTPD2PSZ128rmb addr:$src)>;
- def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
@@ -8194,12 +8191,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2DQZ128rmb addr:$src)>;
- def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8223,12 +8220,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2DQZ128rmb addr:$src)>;
- def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8252,12 +8249,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTPD2UDQZ128rmb addr:$src)>;
- def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8281,12 +8278,12 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))),
+ def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2UDQZ128rmb addr:$src)>;
- def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
(VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
}
@@ -8419,12 +8416,12 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
@@ -8448,12 +8445,12 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTUQQ2PSZ128rmb addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
(VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
v4f32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
@@ -8576,21 +8573,21 @@ let ExeDomain = GenericDomain in {
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>,
+ (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
Sched<[RR]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
(ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
_dest.RC:$src0, _src.KRCWM:$mask))]>,
Sched<[RR]>, EVEX_K;
def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
(ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+ (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2),
_dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
Sched<[RR]>, EVEX_KZ;
let hasSideEffects = 0, mayStore = 1 in {
@@ -8631,17 +8628,17 @@ let Predicates = [HasAVX512] in {
}
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
- def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>;
+ def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -8765,7 +8762,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8859,7 +8856,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8940,7 +8937,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(fsqrt (_.VT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+ (_.BroadcastLdFrag addr:$src)))>,
EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9049,14 +9046,14 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3)))>,
+ (i32 timm:$src3)))>,
Sched<[sched]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
(_.VT (X86RndScalesSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3)))>, EVEX_B,
+ (i32 timm:$src3)))>, EVEX_B,
Sched<[sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9064,7 +9061,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>,
+ _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -9082,15 +9079,15 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
}
let Predicates = [HasAVX512] in {
- def : Pat<(X86VRndScale _.FRC:$src1, imm:$src2),
+ def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
- _.FRC:$src1, imm:$src2))>;
+ _.FRC:$src1, timm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), imm:$src2),
+ def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
- addr:$src1, imm:$src2))>;
+ addr:$src1, timm:$src2))>;
}
}
@@ -10109,19 +10106,19 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))>, Sched<[sched]>;
+ (i32 timm:$src2))>, Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))>,
+ (i32 timm:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
- (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))>, EVEX_B,
+ (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10136,7 +10133,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))>,
+ (i32 timm:$src2))>,
EVEX_B, Sched<[sched]>;
}
@@ -10169,22 +10166,22 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i32 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10200,7 +10197,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
@@ -10208,7 +10205,7 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10226,8 +10223,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10241,15 +10238,14 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
Sched<[sched]>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3))>,
+ (_.VT _.ScalarIntMemCPat:$src2),
+ (i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10265,7 +10261,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10279,7 +10275,7 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode
"$src1, $src2, {sae}, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3))>,
+ (i32 timm:$src3))>,
EVEX_B, Sched<[sched]>;
}
@@ -10401,7 +10397,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (bitconvert
(CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,
- (i8 imm:$src3)))))>,
+ (i8 timm:$src3)))))>,
Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
@@ -10410,7 +10406,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(bitconvert
(CastInfo.VT (X86Shuf128 _.RC:$src1,
(CastInfo.LdFrag addr:$src2),
- (i8 imm:$src3)))))>,
+ (i8 timm:$src3)))))>,
Sched<[sched.Folded, sched.ReadAfterFold]>,
EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -10421,8 +10417,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
(bitconvert
(CastInfo.VT
(X86Shuf128 _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (i8 imm:$src3)))))>, EVEX_B,
+ (_.BroadcastLdFrag addr:$src2),
+ (i8 timm:$src3)))))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10491,14 +10487,14 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>,
+ (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>,
Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86VAlign _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)),
- (i8 imm:$src3)))>,
+ (i8 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>,
EVEX2VEXOverride<"VPALIGNRrmi">;
@@ -10507,8 +10503,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(X86VAlign _.RC:$src1,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10541,13 +10537,13 @@ defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr",
// Fragments to help convert valignq into masked valignd. Or valignq/valignd
// into vpalignr.
-def ValignqImm32XForm : SDNodeXForm<imm, [{
+def ValignqImm32XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
}]>;
-def ValignqImm8XForm : SDNodeXForm<imm, [{
+def ValignqImm8XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
}]>;
-def ValigndImm8XForm : SDNodeXForm<imm, [{
+def ValigndImm8XForm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
}]>;
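// Worked instance of the scale factors above, assuming the immediates count
// whole elements: a rotation by N quadwords is the same shift as 2*N
// doublewords (ValignqImm32XForm) or 8*N bytes (ValignqImm8XForm), and a
// doubleword count of M corresponds to 4*M bytes (ValigndImm8XForm). For
// example, valignq with imm 3 maps to valignd with imm 6 or vpalignr with imm 24.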
@@ -10557,40 +10553,40 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
- imm:$src3))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1, From.RC:$src2,
- imm:$src3))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(From.LdFrag addr:$src2),
- imm:$src3))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(From.LdFrag addr:$src2),
- imm:$src3))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
}
multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
@@ -10599,35 +10595,32 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
SDNodeXForm ImmXForm> :
avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
def : Pat<(From.VT (OpNode From.RC:$src1,
- (bitconvert (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3)),
+ (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3)),
(!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
- (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3))),
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
def : Pat<(To.VT (vselect To.KRCWM:$mask,
(bitconvert
(From.VT (OpNode From.RC:$src1,
(bitconvert
- (To.VT (X86VBroadcast
- (To.ScalarLdFrag addr:$src2)))),
- imm:$src3))),
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
- (ImmXForm imm:$src3))>;
+ (ImmXForm timm:$src3))>;
}
let Predicates = [HasAVX512] in {
@@ -10666,13 +10659,13 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase,
+ (_.VT (OpNode (_.VT _.RC:$src1)))>, EVEX, AVX5128IBase,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
+ (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1)))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
}
@@ -10685,8 +10678,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
- (_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))))>,
+ (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
}
@@ -10770,7 +10762,7 @@ let Predicates = [HasAVX512, NoVLX] in {
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd, NoVLX] in {
- def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
+ def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
@@ -10778,7 +10770,7 @@ multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
_.info256.SubRegIdx)),
_.info256.SubRegIdx)>;
- def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
+ def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
(INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
@@ -10829,17 +10821,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup,
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
-multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX,
+ (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src)))))>,
+ (_.VT (_.BroadcastLdFrag addr:$src))>,
EVEX, EVEX_CD8<_.EltSize, CD8VH>,
Sched<[sched.Folded]>;
}
@@ -10853,7 +10844,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX512, HasVLX] in {
defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM,
VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM,
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, sched.XMM,
VTInfo.info128>, EVEX_V128;
}
}
@@ -10867,11 +10858,9 @@ multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>;
let Predicates = [HasVLX] in {
-def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
@@ -10884,17 +10873,17 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(v2f64 VR128X:$src0)),
(VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
immAllZerosV),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -11070,14 +11059,14 @@ multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -11104,6 +11093,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86FoldableSchedWrite sched,
X86VectorVTInfo _dst, X86VectorVTInfo _src> {
+ let isCommutable = 1 in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -11140,7 +11130,7 @@ defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
-def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG321_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/4 and 3/6.
@@ -11151,7 +11141,7 @@ def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG213_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 2/4 and 3/5.
@@ -11162,7 +11152,7 @@ def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x20) NewImm |= 0x08;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG132_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
uint8_t Imm = N->getZExtValue();
// Swap bits 1/2 and 5/6.
@@ -11173,7 +11163,7 @@ def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG231_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by moving operand 1 to the end.
uint8_t Imm = N->getZExtValue();
// Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
@@ -11186,7 +11176,7 @@ def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
if (Imm & 0x40) NewImm |= 0x20;
return getI8Imm(NewImm, SDLoc(N));
}]>;
-def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
+def VPTERNLOG312_imm8 : SDNodeXForm<timm, [{
// Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
uint8_t Imm = N->getZExtValue();
// Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
@@ -11210,7 +11200,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4)), 1, 1>,
+ (i8 timm:$src4)), 1, 1>,
AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
@@ -11218,7 +11208,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>,
+ (i8 timm:$src4)), 1, 0>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -11227,146 +11217,145 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>, EVEX_B,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ (i8 timm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
+ (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
// Additional patterns for matching loads in other positions.
def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4))),
+ _.RC:$src2, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching zero masking with loads in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching masked loads with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
- (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
+ (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
// Additional patterns for matching broadcasts in other positions.
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4))),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4))),
(!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
- addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching zero masking with broadcasts in other
// positions.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
- (VPTERNLOG321_imm8 imm:$src4))>;
+ (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
- (VPTERNLOG132_imm8 imm:$src4))>;
+ (VPTERNLOG132_imm8 timm:$src4))>;
// Additional patterns for matching masked broadcasts with different
// operand orders.
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, (i8 imm:$src4)),
+ (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- (i8 imm:$src4)), _.RC:$src1)),
+ (_.BroadcastLdFrag addr:$src3),
+ (i8 timm:$src4)), _.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src1, (i8 imm:$src4)),
+ (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ (OpNode (_.BroadcastLdFrag addr:$src3),
+ _.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
- _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>;
}
multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched,
@@ -11387,6 +11376,113 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
avx512vl_i64_info>, VEX_W;
+// Patterns to use VPTERNLOG for vXi16/vXi8 vectors.
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (loadv16i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (loadv16i8 addr:$src3), VR128X:$src2,
+ VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, (loadv16i8 addr:$src3),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (loadv8i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (loadv8i16 addr:$src3), VR128X:$src2,
+ VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, (loadv8i16 addr:$src3),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (loadv32i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (loadv32i8 addr:$src3), VR256X:$src2,
+ VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, (loadv32i8 addr:$src3),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (loadv16i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (loadv16i16 addr:$src3), VR256X:$src2,
+ VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, (loadv16i16 addr:$src3),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (loadv64i8 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (loadv64i8 addr:$src3), VR512:$src2,
+ VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, (loadv64i8 addr:$src3),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (loadv32i16 addr:$src3), (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (loadv32i16 addr:$src3), VR512:$src2,
+ VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+}
+
// Patterns to implement vnot using vpternlog instead of creating all ones
// using pcmpeq or vpternlog and then xoring with that. The value 15 is chosen
// so that the result is only dependent on src0. But we use the same source
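// Sketch of why 15 works, assuming imm8 bit (a*4 + b*2 + c) is the result for
// source bits a/b/c: 0x0f sets exactly the indices with a == 0, so the output
// is ~a regardless of what the second and third sources hold.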
@@ -11498,14 +11594,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
- (i32 imm:$src4))>, Sched<[sched]>;
+ (i32 timm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
@@ -11513,8 +11609,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
"$src2, ${src3}"##_.BroadcastStr##", $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)),
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
}
@@ -11531,7 +11627,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
(X86VFixupimmSAE (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched]>;
}
}
@@ -11547,7 +11643,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4))>, Sched<[sched]>;
+ (i32 timm:$src4))>, Sched<[sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -11555,7 +11651,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(X86VFixupimmSAEs (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
@@ -11564,13 +11660,13 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(_.VT _.RC:$src2),
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
- (i32 imm:$src4))>,
+ (i32 timm:$src4))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
- AVX512VLVectorVTInfo _Vec,
+ AVX512VLVectorVTInfo _Vec,
AVX512VLVectorVTInfo _Tbl> {
let Predicates = [HasAVX512] in
defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
@@ -11804,7 +11900,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -11880,12 +11976,14 @@ defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
let Constraints = "$src1 = $dst" in
multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
+ X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
+ bit IsCommutable> {
defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1,
- VTI.RC:$src2, VTI.RC:$src3))>,
+ VTI.RC:$src2, VTI.RC:$src3)),
+ IsCommutable, IsCommutable>,
EVEX_4V, T8PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
@@ -11899,27 +11997,58 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
"$src2, ${src3}"##VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (X86VBroadcast
- (VTI.ScalarLdFrag addr:$src3))))>,
+ (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
- X86SchedWriteWidths sched> {
+ X86SchedWriteWidths sched, bit IsCommutable> {
let Predicates = [HasVNNI] in
- defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512;
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info,
+ IsCommutable>, EVEX_V512;
let Predicates = [HasVNNI, HasVLX] in {
- defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256;
- defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128;
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info,
+ IsCommutable>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info,
+ IsCommutable>, EVEX_V128;
}
}
// FIXME: Is there a better scheduler class for VPDP?
-defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>;
-defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>;
-defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>;
-defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>;
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul, 0>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul, 0>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul, 1>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul, 1>;
+
+def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
+ (X86vpmaddwd node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+
+// Patterns to match VPDPWSSD from existing instructions/intrinsics.
+let Predicates = [HasVNNI] in {
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, VR512:$src3))),
+ (VPDPWSSDZr VR512:$src1, VR512:$src2, VR512:$src3)>;
+ def : Pat<(v16i32 (add VR512:$src1,
+ (X86vpmaddwd_su VR512:$src2, (load addr:$src3)))),
+ (VPDPWSSDZm VR512:$src1, VR512:$src2, addr:$src3)>;
+}
+let Predicates = [HasVNNI,HasVLX] in {
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, VR256X:$src3))),
+ (VPDPWSSDZ256r VR256X:$src1, VR256X:$src2, VR256X:$src3)>;
+ def : Pat<(v8i32 (add VR256X:$src1,
+ (X86vpmaddwd_su VR256X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ256m VR256X:$src1, VR256X:$src2, addr:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, VR128X:$src3))),
+ (VPDPWSSDZ128r VR128X:$src1, VR128X:$src2, VR128X:$src3)>;
+ def : Pat<(v4i32 (add VR128X:$src1,
+ (X86vpmaddwd_su VR128X:$src2, (load addr:$src3)))),
+ (VPDPWSSDZ128m VR128X:$src1, VR128X:$src2, addr:$src3)>;
+}
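// Rough source-level shape these patterns are meant to catch, assuming the
// X86vpmaddwd node is what _mm512_madd_epi16 lowers to: an add of the
// accumulator with a single-use pmaddwd result can become one vpdpwssd when
// VNNI is available (compile with e.g. -mavx512bw -mavx512vnni).
#include <immintrin.h>

__m512i dot_accumulate(__m512i Acc, __m512i A, __m512i B) {
  // vpmaddwd: 16-bit by 16-bit products summed pairwise into 32-bit lanes.
  __m512i Prod = _mm512_madd_epi16(A, B);
  // The add of Acc with the one-use Prod is the candidate for vpdpwssd.
  return _mm512_add_epi32(Acc, Prod);
}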
//===----------------------------------------------------------------------===//
// Bit Algorithms
@@ -12004,8 +12133,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1",
"$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
- (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))),
- (i8 imm:$src3))>, EVEX_B,
+ (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))),
+ (i8 timm:$src3))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12116,7 +12245,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
!strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr,
", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
[(set _.KRPC:$dst, (X86vp2intersect
- _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>,
+ _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
}
@@ -12217,12 +12346,12 @@ let Predicates = [HasBF16, HasVLX] in {
(VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
- (X86VBroadcast (loadf32 addr:$src))))),
+ (X86VBroadcastld32 addr:$src)))),
(VCVTNEPS2BF16Z128rmb addr:$src)>;
- def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
(v8i16 VR128X:$src0), VK4WM:$mask),
(VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
- def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)),
v8i16x_info.ImmAllZerosV, VK4WM:$mask),
(VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
}
@@ -12249,7 +12378,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr),
(_.VT (OpNode _.RC:$src1, _.RC:$src2,
- (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>,
+ (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
EVEX_B, EVEX_4V;
}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index e52635f8d48b..1e399a894490 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1271,22 +1271,22 @@ let isCompare = 1 in {
// ANDN Instruction
//
multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
- PatFrag ld_frag> {
+ PatFrag ld_frag, X86FoldableSchedWrite sched> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- Sched<[WriteALU]>;
+ Sched<[sched]>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
(X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
}
// Complexity is reduced to give and with immediate a chance to match first.
let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in {
- defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V;
- defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W;
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, VEX_4V, VEX_W;
}
let Predicates = [HasBMI], AddedComplexity = -6 in {
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index 50aed98112c3..aa45e9b191c1 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -131,11 +131,11 @@ addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
/// reference.
static inline void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand,
unsigned Reg) {
- // Direct memory address is in a form of: Reg, 1 (Scale), NoReg, 0, NoReg.
- MI->getOperand(Operand).setReg(Reg);
+ // Direct memory address is in a form of: Reg/FI, 1 (Scale), NoReg, 0, NoReg.
+ MI->getOperand(Operand).ChangeToRegister(Reg, /*isDef=*/false);
MI->getOperand(Operand + 1).setImm(1);
MI->getOperand(Operand + 2).setReg(0);
- MI->getOperand(Operand + 3).setImm(0);
+ MI->getOperand(Operand + 3).ChangeToImmediate(0);
MI->getOperand(Operand + 4).setReg(0);
}
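// Presumably ChangeToRegister/ChangeToImmediate are needed here because the
// base and displacement slots being rewritten are not guaranteed to already be
// register/immediate operands (the updated comment allows a frame index as the
// base), whereas setReg/setImm require the operand to already be of that kind.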
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index 099f6aa8d8bb..330b8c7a8a43 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -20,19 +20,19 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
: I<0x40, MRMSrcRegCC, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, ccode:$cond),
"cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst,
- (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>,
+ (X86cmov GR16:$src1, GR16:$src2, timm:$cond, EFLAGS))]>,
TB, OpSize16;
def CMOV32rr
: I<0x40, MRMSrcRegCC, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, ccode:$cond),
"cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst,
- (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>,
+ (X86cmov GR32:$src1, GR32:$src2, timm:$cond, EFLAGS))]>,
TB, OpSize32;
def CMOV64rr
:RI<0x40, MRMSrcRegCC, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, ccode:$cond),
"cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst,
- (X86cmov GR64:$src1, GR64:$src2, imm:$cond, EFLAGS))]>, TB;
+ (X86cmov GR64:$src1, GR64:$src2, timm:$cond, EFLAGS))]>, TB;
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
@@ -41,29 +41,46 @@ let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
: I<0x40, MRMSrcMemCC, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2, ccode:$cond),
"cmov${cond}{w}\t{$src2, $dst|$dst, $src2}",
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
- imm:$cond, EFLAGS))]>, TB, OpSize16;
+ timm:$cond, EFLAGS))]>, TB, OpSize16;
def CMOV32rm
: I<0x40, MRMSrcMemCC, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2, ccode:$cond),
"cmov${cond}{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
- imm:$cond, EFLAGS))]>, TB, OpSize32;
+ timm:$cond, EFLAGS))]>, TB, OpSize32;
def CMOV64rm
:RI<0x40, MRMSrcMemCC, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2, ccode:$cond),
"cmov${cond}{q}\t{$src2, $dst|$dst, $src2}",
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
- imm:$cond, EFLAGS))]>, TB;
+ timm:$cond, EFLAGS))]>, TB;
} // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
} // isCodeGenOnly = 1, ForceDisassemble = 1
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+ X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
+ return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
+ SDLoc(N), MVT::i8);
+}]>;
+
+// Conditional moves with a folded load, with the operands swapped and the
+// condition inverted.
+let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+ (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+ (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+ (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+}
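
inv_cond_XFORM turns the condition-code immediate into its opposite so that a cmov whose load operand comes first can still use the folded-load form: select(c, a, b) equals select(!c, b, a), so swapping the operands and inverting the condition preserves the result. A standalone sketch of the inversion, assuming the hardware tttn encoding of x86 condition codes, in which a predicate and its negation differ only in the lowest bit; under that assumption the inversion amounts to toggling bit 0:

```cpp
#include <cassert>

// Hardware-style x86 condition-code encoding (tttn field of Jcc/SETcc/CMOVcc).
// Every predicate sits next to its negation: E = 0x4 / NE = 0x5,
// L = 0xC / GE = 0xD, and so on.
enum CondCode : unsigned {
  COND_O = 0x0, COND_NO = 0x1, COND_B = 0x2, COND_AE = 0x3,
  COND_E = 0x4, COND_NE = 0x5, COND_BE = 0x6, COND_A = 0x7,
  COND_S = 0x8, COND_NS = 0x9, COND_P = 0xA, COND_NP = 0xB,
  COND_L = 0xC, COND_GE = 0xD, COND_LE = 0xE, COND_G = 0xF
};

// For this encoding the opposite predicate is obtained by toggling bit 0,
// which is the effect the inv_cond_XFORM above relies on.
constexpr CondCode invert(CondCode CC) {
  return static_cast<CondCode>(CC ^ 1);
}

int main() {
  assert(invert(COND_E) == COND_NE);
  assert(invert(COND_GE) == COND_L);
}
```
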
+
// SetCC instructions.
let Uses = [EFLAGS], isCodeGenOnly = 1, ForceDisassemble = 1 in {
def SETCCr : I<0x90, MRMXrCC, (outs GR8:$dst), (ins ccode:$cond),
"set${cond}\t$dst",
- [(set GR8:$dst, (X86setcc imm:$cond, EFLAGS))]>,
+ [(set GR8:$dst, (X86setcc timm:$cond, EFLAGS))]>,
TB, Sched<[WriteSETCC]>;
def SETCCm : I<0x90, MRMXmCC, (outs), (ins i8mem:$dst, ccode:$cond),
"set${cond}\t$dst",
- [(store (X86setcc imm:$cond, EFLAGS), addr:$dst)]>,
+ [(store (X86setcc timm:$cond, EFLAGS), addr:$dst)]>,
TB, Sched<[WriteSETCCStore]>;
} // Uses = [EFLAGS]
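
The imm to timm change in these patterns (and in the cmov, brcond and setcc patterns in the neighbouring files) reflects the distinction between ordinary constants and target constants in the selection DAG: the condition-code operand is produced as a target constant, and timm is the fragment that matches such nodes. A rough standalone illustration of the distinction, with hypothetical Node and matches* names:

```cpp
#include <cassert>

// Minimal stand-in for the two kinds of constant a selection DAG can carry
// (hypothetical type, not LLVM's).  A plain Constant is still a value that
// may be legalized or materialized; a TargetConstant must be consumed
// directly as an immediate operand of the selected instruction.
struct Node {
  enum Kind { Constant, TargetConstant } K;
  long Value;
};

// A pattern written with imm only accepts Node::Constant; one written with
// timm only accepts Node::TargetConstant.  Since the condition-code operand
// of these nodes is built as a target constant, the patterns use timm:$cond.
bool matchesTimm(const Node &N) { return N.K == Node::TargetConstant; }
bool matchesImm(const Node &N)  { return N.K == Node::Constant; }

int main() {
  Node Cond{Node::TargetConstant, /*COND_E=*/4};
  assert(matchesTimm(Cond) && !matchesImm(Cond));
}
```
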
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index efaccdc9ee96..78d8dd3c0d03 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -542,7 +542,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
def CMOV#NAME : I<0, Pseudo,
(outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond),
"#CMOV_"#NAME#" PSEUDO!",
- [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond,
+ [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, timm:$cond,
EFLAGS)))]>;
}
@@ -593,66 +593,66 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _VK64 : CMOVrr_PSEUDO<VK64, v64i1>;
} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]
-def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
+def : Pat<(f128 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
let Predicates = [NoVLX] in {
- def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
- def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128 VR128:$t, VR128:$f, imm:$cond)>;
-
- def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
- def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256 VR256:$t, VR256:$f, imm:$cond)>;
+ def : Pat<(v16i8 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>;
}
let Predicates = [HasVLX] in {
- def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
- def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR128X VR128X:$t, VR128X:$f, imm:$cond)>;
-
- def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
- def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, imm:$cond, EFLAGS)),
- (CMOV_VR256X VR256X:$t, VR256X:$f, imm:$cond)>;
+ def : Pat<(v16i8 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+ def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>;
+
+ def : Pat<(v32i8 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
+ def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>;
}
-def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
-def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond, EFLAGS)),
- (CMOV_VR512 VR512:$t, VR512:$f, imm:$cond)>;
+def : Pat<(v64i8 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
+ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
@@ -1126,12 +1126,12 @@ def : Pat<(f64 (bitconvert (i64 (atomic_load_64 addr:$src)))),
// binary size compared to a regular MOV, but it introduces an unnecessary
// load, so is not suitable for regular or optsize functions.
let Predicates = [OptForMinSize] in {
-def : Pat<(nonvolatile_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
-def : Pat<(nonvolatile_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
-def : Pat<(nonvolatile_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
-def : Pat<(nonvolatile_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>;
+def : Pat<(simple_store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>;
+def : Pat<(simple_store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>;
}
// In kernel code model, we can get the address of a label
@@ -1276,23 +1276,6 @@ def : Pat<(X86cmp GR32:$src1, 0),
def : Pat<(X86cmp GR64:$src1, 0),
(TEST64rr GR64:$src1, GR64:$src1)>;
-def inv_cond_XFORM : SDNodeXForm<imm, [{
- X86::CondCode CC = static_cast<X86::CondCode>(N->getZExtValue());
- return CurDAG->getTargetConstant(X86::GetOppositeBranchCondition(CC),
- SDLoc(N), MVT::i8);
-}]>;
-
-// Conditional moves with folded loads with operands swapped and conditions
-// inverted.
-let Predicates = [HasCMov] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, imm:$cond, EFLAGS),
- (CMOV16rm GR16:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, imm:$cond, EFLAGS),
- (CMOV32rm GR32:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, imm:$cond, EFLAGS),
- (CMOV64rm GR64:$src2, addr:$src1, (inv_cond_XFORM imm:$cond))>;
-}
-
// zextload bool -> zextload byte
// i1 stored in one byte in zero-extended form.
// Upper bits cleanup should be executed before Store.
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index f82e80965b7c..e1e6eea59884 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -75,7 +75,7 @@ let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump],
def JCC_1 : Ii8PCRel <0x70, AddCCFrm, (outs),
(ins brtarget8:$dst, ccode:$cond),
"j${cond}\t$dst",
- [(X86brcond bb:$dst, imm:$cond, EFLAGS)]>;
+ [(X86brcond bb:$dst, timm:$cond, EFLAGS)]>;
let hasSideEffects = 0 in {
def JCC_2 : Ii16PCRel<0x80, AddCCFrm, (outs),
(ins brtarget16:$dst, ccode:$cond),
@@ -145,6 +145,17 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
[(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>,
Sched<[WriteJumpLd]>;
+ // Win64 wants indirect jumps leaving the function to have a REX_W prefix.
+ // These are switched from TAILJMPr/m64_REX in MCInstLower.
+ let isCodeGenOnly = 1, hasREX_WPrefix = 1 in {
+ def JMP64r_REX : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJump]>;
+ let mayLoad = 1 in
+ def JMP64m_REX : I<0xFF, MRM4m, (outs), (ins i64mem:$dst),
+ "rex64 jmp{q}\t{*}$dst", []>, Sched<[WriteJumpLd]>;
+
+ }
+
// Non-tracking jumps for IBT, use with caution.
let isCodeGenOnly = 1 in {
def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst",
@@ -273,39 +284,35 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [ESP, SSP] in {
- def TCRETURNdi : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable;
- def TCRETURNri : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ isCodeGenOnly = 1, Uses = [ESP, SSP] in {
+ def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
+ def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
- def TCRETURNmi : PseudoI<(outs),
- (ins i32mem_TC:$dst, i32imm:$offset), []>;
+ def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>;
- // FIXME: The should be pseudo instructions that are lowered when going to
- // mcinst.
- def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
- (ins i32imm_pcrel:$dst), "jmp\t$dst", []>;
+ def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst),
+ []>, Sched<[WriteJump]>;
- def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "", []>; // FIXME: Remove encoding when JIT is dead.
+ def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
- "jmp{l}\t{*}$dst", []>;
+ def TAILJMPm : PseudoI<(outs), (ins i32mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
}
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>;
+ def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>;
}
@@ -348,34 +355,36 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
+ isCodeGenOnly = 1, Uses = [RSP, SSP] in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset),
- []>;
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;
+ (ins i64mem_TC:$dst, i32imm:$offset),
+ []>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
- def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
- "jmp\t$dst", []>;
+ def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst),
+ []>, Sched<[WriteJump]>;
- def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "jmp{q}\t{*}$dst", []>;
+ def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "jmp{q}\t{*}$dst", []>;
+ def TAILJMPm64 : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
- def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
- "rex64 jmp{q}\t{*}$dst", []>;
+ def TAILJMPr64_REX : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
+ []>, Sched<[WriteJump]>;
let mayLoad = 1 in
- def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
- "rex64 jmp{q}\t{*}$dst", []>;
+ def TAILJMPm64_REX : PseudoI<(outs), (ins i64mem_TC:$dst),
+ []>, Sched<[WriteJumpLd]>;
}
}
@@ -403,13 +412,13 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
// Conditional tail calls are similar to the above, but they are branches
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset,
i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>;
+ def TAILJMPd64_CC : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>;
}
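
The tail-call instructions in this file become plain pseudos with explicit Sched classes; the concrete jump encodings are chosen later, during MC lowering, as the comments above say, and Win64 gets REX.W-prefixed indirect forms. A sketch of that final substitution step with hypothetical names on the lowered side (the real lowered opcode names are not shown in this diff):

```cpp
#include <cassert>

// Hypothetical sketch of the MC-lowering substitution: each tail-jump pseudo
// is replaced by a concrete jump, and the *_REX pseudos pick REX.W-prefixed
// forms because Win64 wants indirect jumps that leave the function prefixed.
enum Pseudo { TAILJMPd64, TAILJMPr64, TAILJMPm64, TAILJMPr64_REX, TAILJMPm64_REX };
enum Lowered { JMPdirect, JMPreg, JMPmem, JMPreg_REX, JMPmem_REX };

Lowered lowerTailJump(Pseudo P) {
  switch (P) {
  case TAILJMPd64:     return JMPdirect;   // pc-relative jmp
  case TAILJMPr64:     return JMPreg;      // jmp *%reg
  case TAILJMPm64:     return JMPmem;      // jmp *mem
  case TAILJMPr64_REX: return JMPreg_REX;  // rex64 jmp *%reg
  case TAILJMPm64_REX: return JMPmem_REX;  // rex64 jmp *mem
  }
  return JMPdirect;
}

int main() { assert(lowerTailJump(TAILJMPm64_REX) == JMPmem_REX); }
```
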
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 06e605fe5db2..7a4eb138ec34 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -17,19 +17,18 @@ let hasSideEffects = 0 in {
let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
def CWDE : I<0x98, RawFrm, (outs), (ins),
"{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>;
+ let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
+ // FIXME: CWD/CDQ/CQO shouldn't Def the A register, but the fast register
+ // allocator crashes if you remove it.
let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
def CWD : I<0x99, RawFrm, (outs), (ins),
"{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>;
let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
def CDQ : I<0x99, RawFrm, (outs), (ins),
"{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>;
-
-
- let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
- def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
-
let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
"{cqto|cqo}", []>, Sched<[WriteALU]>, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp
index d42fec3770c7..f3b286e0375c 100644
--- a/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/lib/Target/X86/X86InstrFoldTables.cpp
@@ -292,6 +292,8 @@ static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::JMP32r_NT, X86::JMP32m_NT, TB_FOLDED_LOAD },
{ X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
{ X86::JMP64r_NT, X86::JMP64m_NT, TB_FOLDED_LOAD },
+ { X86::MMX_MOVD64from64rr, X86::MMX_MOVD64from64rm, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::MMX_MOVD64grr, X86::MMX_MOVD64mr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
{ X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
{ X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
@@ -5245,6 +5247,270 @@ static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
{ X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
};
+static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
+ { X86::VADDPDZ128rr, X86::VADDPDZ128rmb, TB_BCAST_SD },
+ { X86::VADDPDZ256rr, X86::VADDPDZ256rmb, TB_BCAST_SD },
+ { X86::VADDPDZrr, X86::VADDPDZrmb, TB_BCAST_SD },
+ { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS },
+ { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS },
+ { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS },
+ { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD },
+ { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD },
+ { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS },
+ { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS },
+ { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD },
+ { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD },
+ { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD },
+ { X86::VDIVPSZ128rr, X86::VDIVPSZ128rmb, TB_BCAST_SS },
+ { X86::VDIVPSZ256rr, X86::VDIVPSZ256rmb, TB_BCAST_SS },
+ { X86::VDIVPSZrr, X86::VDIVPSZrmb, TB_BCAST_SS },
+ { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXCPDZrr, X86::VMAXCPDZrmb, TB_BCAST_SD },
+ { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXCPSZrr, X86::VMAXCPSZrmb, TB_BCAST_SS },
+ { X86::VMAXPDZ128rr, X86::VMAXPDZ128rmb, TB_BCAST_SD },
+ { X86::VMAXPDZ256rr, X86::VMAXPDZ256rmb, TB_BCAST_SD },
+ { X86::VMAXPDZrr, X86::VMAXPDZrmb, TB_BCAST_SD },
+ { X86::VMAXPSZ128rr, X86::VMAXPSZ128rmb, TB_BCAST_SS },
+ { X86::VMAXPSZ256rr, X86::VMAXPSZ256rmb, TB_BCAST_SS },
+ { X86::VMAXPSZrr, X86::VMAXPSZrmb, TB_BCAST_SS },
+ { X86::VMINCPDZ128rr, X86::VMINCPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINCPDZ256rr, X86::VMINCPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINCPDZrr, X86::VMINCPDZrmb, TB_BCAST_SD },
+ { X86::VMINCPSZ128rr, X86::VMINCPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINCPSZ256rr, X86::VMINCPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINCPSZrr, X86::VMINCPSZrmb, TB_BCAST_SS },
+ { X86::VMINPDZ128rr, X86::VMINPDZ128rmb, TB_BCAST_SD },
+ { X86::VMINPDZ256rr, X86::VMINPDZ256rmb, TB_BCAST_SD },
+ { X86::VMINPDZrr, X86::VMINPDZrmb, TB_BCAST_SD },
+ { X86::VMINPSZ128rr, X86::VMINPSZ128rmb, TB_BCAST_SS },
+ { X86::VMINPSZ256rr, X86::VMINPSZ256rmb, TB_BCAST_SS },
+ { X86::VMINPSZrr, X86::VMINPSZrmb, TB_BCAST_SS },
+ { X86::VMULPDZ128rr, X86::VMULPDZ128rmb, TB_BCAST_SD },
+ { X86::VMULPDZ256rr, X86::VMULPDZ256rmb, TB_BCAST_SD },
+ { X86::VMULPDZrr, X86::VMULPDZrmb, TB_BCAST_SD },
+ { X86::VMULPSZ128rr, X86::VMULPSZ128rmb, TB_BCAST_SS },
+ { X86::VMULPSZ256rr, X86::VMULPSZ256rmb, TB_BCAST_SS },
+ { X86::VMULPSZrr, X86::VMULPSZrmb, TB_BCAST_SS },
+ { X86::VPADDDZ128rr, X86::VPADDDZ128rmb, TB_BCAST_D },
+ { X86::VPADDDZ256rr, X86::VPADDDZ256rmb, TB_BCAST_D },
+ { X86::VPADDDZrr, X86::VPADDDZrmb, TB_BCAST_D },
+ { X86::VPADDQZ128rr, X86::VPADDQZ128rmb, TB_BCAST_Q },
+ { X86::VPADDQZ256rr, X86::VPADDQZ256rmb, TB_BCAST_Q },
+ { X86::VPADDQZrr, X86::VPADDQZrmb, TB_BCAST_Q },
+ { X86::VPANDDZ128rr, X86::VPANDDZ128rmb, TB_BCAST_D },
+ { X86::VPANDDZ256rr, X86::VPANDDZ256rmb, TB_BCAST_D },
+ { X86::VPANDDZrr, X86::VPANDDZrmb, TB_BCAST_D },
+ { X86::VPANDNDZ128rr, X86::VPANDNDZ128rmb, TB_BCAST_D },
+ { X86::VPANDNDZ256rr, X86::VPANDNDZ256rmb, TB_BCAST_D },
+ { X86::VPANDNDZrr, X86::VPANDNDZrmb, TB_BCAST_D },
+ { X86::VPANDNQZ128rr, X86::VPANDNQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDNQZ256rr, X86::VPANDNQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDNQZrr, X86::VPANDNQZrmb, TB_BCAST_Q },
+ { X86::VPANDQZ128rr, X86::VPANDQZ128rmb, TB_BCAST_Q },
+ { X86::VPANDQZ256rr, X86::VPANDQZ256rmb, TB_BCAST_Q },
+ { X86::VPANDQZrr, X86::VPANDQZrmb, TB_BCAST_Q },
+ { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPDZrri, X86::VPCMPDZrmib, TB_BCAST_D },
+ { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPEQDZrr, X86::VPCMPEQDZrmb, TB_BCAST_D },
+ { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPEQQZrr, X86::VPCMPEQQZrmb, TB_BCAST_Q },
+ { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rmb, TB_BCAST_D },
+ { X86::VPCMPGTDZrr, X86::VPCMPGTDZrmb, TB_BCAST_D },
+ { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rmb, TB_BCAST_Q },
+ { X86::VPCMPGTQZrr, X86::VPCMPGTQZrmb, TB_BCAST_Q },
+ { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPQZrri, X86::VPCMPQZrmib, TB_BCAST_Q },
+ { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmib, TB_BCAST_D },
+ { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmib, TB_BCAST_D },
+ { X86::VPCMPUDZrri, X86::VPCMPUDZrmib, TB_BCAST_D },
+ { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmib, TB_BCAST_Q },
+ { X86::VPCMPUQZrri, X86::VPCMPUQZrmib, TB_BCAST_Q },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXSDZrr, X86::VPMAXSDZrmb, TB_BCAST_D },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXSQZrr, X86::VPMAXSQZrmb, TB_BCAST_Q },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rmb, TB_BCAST_D },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rmb, TB_BCAST_D },
+ { X86::VPMAXUDZrr, X86::VPMAXUDZrmb, TB_BCAST_D },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMAXUQZrr, X86::VPMAXUQZrmb, TB_BCAST_Q },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rmb, TB_BCAST_D },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rmb, TB_BCAST_D },
+ { X86::VPMINSDZrr, X86::VPMINSDZrmb, TB_BCAST_D },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINSQZrr, X86::VPMINSQZrmb, TB_BCAST_Q },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rmb, TB_BCAST_D },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rmb, TB_BCAST_D },
+ { X86::VPMINUDZrr, X86::VPMINUDZrmb, TB_BCAST_D },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rmb, TB_BCAST_Q },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rmb, TB_BCAST_Q },
+ { X86::VPMINUQZrr, X86::VPMINUQZrmb, TB_BCAST_Q },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rmb, TB_BCAST_D },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rmb, TB_BCAST_D },
+ { X86::VPMULLDZrr, X86::VPMULLDZrmb, TB_BCAST_D },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rmb, TB_BCAST_Q },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rmb, TB_BCAST_Q },
+ { X86::VPMULLQZrr, X86::VPMULLQZrmb, TB_BCAST_Q },
+ { X86::VPORDZ128rr, X86::VPORDZ128rmb, TB_BCAST_D },
+ { X86::VPORDZ256rr, X86::VPORDZ256rmb, TB_BCAST_D },
+ { X86::VPORDZrr, X86::VPORDZrmb, TB_BCAST_D },
+ { X86::VPORQZ128rr, X86::VPORQZ128rmb, TB_BCAST_Q },
+ { X86::VPORQZ256rr, X86::VPORQZ256rmb, TB_BCAST_Q },
+ { X86::VPORQZrr, X86::VPORQZrmb, TB_BCAST_Q },
+ { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
+ { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
+ { X86::VPTESTMDZrr, X86::VPTESTMDZrmb, TB_BCAST_D },
+ { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
+ { X86::VPTESTMQZrr, X86::VPTESTMQZrmb, TB_BCAST_Q },
+ { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D },
+ { X86::VPTESTNMDZrr, X86::VPTESTNMDZrmb, TB_BCAST_D },
+ { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q },
+ { X86::VPTESTNMQZrr, X86::VPTESTNMQZrmb, TB_BCAST_Q },
+ { X86::VPXORDZ128rr, X86::VPXORDZ128rmb, TB_BCAST_D },
+ { X86::VPXORDZ256rr, X86::VPXORDZ256rmb, TB_BCAST_D },
+ { X86::VPXORDZrr, X86::VPXORDZrmb, TB_BCAST_D },
+ { X86::VPXORQZ128rr, X86::VPXORQZ128rmb, TB_BCAST_Q },
+ { X86::VPXORQZ256rr, X86::VPXORQZ256rmb, TB_BCAST_Q },
+ { X86::VPXORQZrr, X86::VPXORQZrmb, TB_BCAST_Q },
+ { X86::VSUBPDZ128rr, X86::VSUBPDZ128rmb, TB_BCAST_SD },
+ { X86::VSUBPDZ256rr, X86::VSUBPDZ256rmb, TB_BCAST_SD },
+ { X86::VSUBPDZrr, X86::VSUBPDZrmb, TB_BCAST_SD },
+ { X86::VSUBPSZ128rr, X86::VSUBPSZ128rmb, TB_BCAST_SS },
+ { X86::VSUBPSZ256rr, X86::VSUBPSZ256rmb, TB_BCAST_SS },
+ { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS },
+};
+
+static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = {
+ { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD },
+ { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS },
+ { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD },
+ { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS },
+ { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD },
+ { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD },
+ { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS },
+ { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS },
+};
+
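
Each entry in these broadcast tables pairs a register-register opcode with an EVEX form whose second source is a broadcast memory operand; the TB_BCAST_* flag records the element width being splatted. A standalone sketch of what folding VADDPDZrr into VADDPDZrmb means for the data flow, using a plain 8-element array in place of a zmm register:

```cpp
#include <array>
#include <cassert>

// Instead of requiring the second source in a full vector register, the
// broadcast memory form reads one 64-bit scalar and replicates it across all
// lanes ({1to8}) before the operation.  The SS/D/Q variants only change the
// element type.
using Vec8d = std::array<double, 8>;

Vec8d addBroadcast(const Vec8d &A, const double *Mem) {
  Vec8d R;
  for (int I = 0; I < 8; ++I)
    R[I] = A[I] + *Mem;   // one scalar load feeds every lane
  return R;
}

int main() {
  Vec8d A{};
  double S = 2.0;
  assert(addBroadcast(A, &S)[7] == 2.0);
}
```
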
static const X86MemoryFoldTableEntry *
lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
#ifndef NDEBUG
@@ -5287,6 +5553,18 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
std::end(MemoryFoldTable4)) ==
std::end(MemoryFoldTable4) &&
"MemoryFoldTable4 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable2),
+ std::end(BroadcastFoldTable2)) ==
+ std::end(BroadcastFoldTable2) &&
+ "BroadcastFoldTable2 is not sorted and unique!");
+ assert(std::is_sorted(std::begin(BroadcastFoldTable3),
+ std::end(BroadcastFoldTable3)) &&
+ std::adjacent_find(std::begin(BroadcastFoldTable3),
+ std::end(BroadcastFoldTable3)) ==
+ std::end(BroadcastFoldTable3) &&
+ "BroadcastFoldTable3 is not sorted and unique!");
FoldTablesChecked.store(true, std::memory_order_relaxed);
}
#endif
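
The new asserts extend the existing sanity checks to the broadcast tables: a table that is binary-searched must be sorted, and it should contain no duplicate keys. The same check in isolation, on a plain vector:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// std::is_sorted checks the ordering; std::adjacent_find, applied to a
// sorted range, reports any pair of equal neighbours, i.e. any duplicate.
bool isSortedAndUnique(const std::vector<int> &Table) {
  return std::is_sorted(Table.begin(), Table.end()) &&
         std::adjacent_find(Table.begin(), Table.end()) == Table.end();
}

int main() {
  assert(isSortedAndUnique({1, 2, 5, 9}));
  assert(!isSortedAndUnique({1, 2, 2, 9}));   // duplicate key
  assert(!isSortedAndUnique({2, 1}));         // unsorted
}
```
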
@@ -5355,6 +5633,15 @@ struct X86MemUnfoldTable {
// Index 4, folded load
addTableEntry(Entry, TB_INDEX_4 | TB_FOLDED_LOAD);
+ // Broadcast tables.
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable2)
+ // Index 2, folded broadcast
+ addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
+ for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3)
+    // Index 3, folded broadcast
+ addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
+
// Sort the memory->reg unfold table.
array_pod_sort(Table.begin(), Table.end());
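
The unfold table is the reverse map: every fold-table entry is re-inserted keyed on its memory opcode, tagged with the operand index and with TB_FOLDED_LOAD plus, for the new tables, TB_FOLDED_BCAST, and the result is sorted once so lookups can binary-search. A sketch of that structure with illustrative field names, not the real X86MemoryFoldTableEntry layout:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Reverse (memory -> register) entry.  Flags would record the folded operand
// index and whether the memory form was a plain load or a broadcast.
struct UnfoldEntry {
  unsigned MemOp;   // key: the memory form
  unsigned RegOp;   // value: the register form to unfold back to
  uint16_t Flags;
  bool operator<(const UnfoldEntry &O) const { return MemOp < O.MemOp; }
};

const UnfoldEntry *lookupUnfold(const std::vector<UnfoldEntry> &Table,
                                unsigned MemOp) {
  // The table is sorted once after construction, so each lookup is a
  // binary search on the memory opcode.
  auto It = std::lower_bound(Table.begin(), Table.end(),
                             UnfoldEntry{MemOp, 0, 0});
  return (It != Table.end() && It->MemOp == MemOp) ? &*It : nullptr;
}

int main() {
  std::vector<UnfoldEntry> T{{30, 3, 0x25}, {10, 1, 0x22}, {20, 2, 0x62}};
  std::sort(T.begin(), T.end());
  return lookupUnfold(T, 20)->RegOp == 2 ? 0 : 1;
}
```
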
diff --git a/lib/Target/X86/X86InstrFoldTables.h b/lib/Target/X86/X86InstrFoldTables.h
index 419baf98f61d..7dc236a0d7e4 100644
--- a/lib/Target/X86/X86InstrFoldTables.h
+++ b/lib/Target/X86/X86InstrFoldTables.h
@@ -19,35 +19,48 @@ namespace llvm {
enum {
// Select which memory operand is being unfolded.
- // (stored in bits 0 - 3)
+ // (stored in bits 0 - 2)
TB_INDEX_0 = 0,
TB_INDEX_1 = 1,
TB_INDEX_2 = 2,
TB_INDEX_3 = 3,
TB_INDEX_4 = 4,
- TB_INDEX_MASK = 0xf,
+ TB_INDEX_MASK = 0x7,
// Do not insert the reverse map (MemOp -> RegOp) into the table.
// This may be needed because there is a many -> one mapping.
- TB_NO_REVERSE = 1 << 4,
+ TB_NO_REVERSE = 1 << 3,
// Do not insert the forward map (RegOp -> MemOp) into the table.
// This is needed for Native Client, which prohibits branch
// instructions from using a memory operand.
- TB_NO_FORWARD = 1 << 5,
+ TB_NO_FORWARD = 1 << 4,
- TB_FOLDED_LOAD = 1 << 6,
- TB_FOLDED_STORE = 1 << 7,
+ TB_FOLDED_LOAD = 1 << 5,
+ TB_FOLDED_STORE = 1 << 6,
+ TB_FOLDED_BCAST = 1 << 7,
// Minimum alignment required for load/store.
- // Used for RegOp->MemOp conversion.
- // (stored in bits 8 - 15)
+ // Used for RegOp->MemOp conversion. Encoded as Log2(Align) + 1 to allow 0
+ // to mean align of 0.
+ // (stored in bits 8 - 11)
TB_ALIGN_SHIFT = 8,
- TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
- TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
- TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
- TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
- TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 5 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 6 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 7 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xf << TB_ALIGN_SHIFT,
+
+ // Broadcast type.
+ // (stored in bits 12 - 13)
+ TB_BCAST_TYPE_SHIFT = 12,
+ TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT,
+ TB_BCAST_MASK = 0x3 << TB_BCAST_TYPE_SHIFT,
+
+ // Unused bits 14-15
};
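
The repacked flag word trades bits so that the broadcast information fits: the operand index needs only bits 0-2, the alignment moves to a 4-bit Log2(Align)+1 encoding in bits 8-11, and bits 12-13 carry the broadcast element type. Decoding helpers for that layout, as a sketch with constant names shortened from the TB_* ones above:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint16_t INDEX_MASK  = 0x7;
constexpr uint16_t ALIGN_SHIFT = 8,  ALIGN_MASK = 0xf << ALIGN_SHIFT;
constexpr uint16_t BCAST_SHIFT = 12, BCAST_MASK = 0x3 << BCAST_SHIFT;

constexpr unsigned operandIndex(uint16_t F) { return F & INDEX_MASK; }

// Alignment is stored as Log2(Align) + 1 so that 0 still means "no
// alignment requirement".
constexpr unsigned alignment(uint16_t F) {
  unsigned Enc = (F & ALIGN_MASK) >> ALIGN_SHIFT;
  return Enc == 0 ? 0 : 1u << (Enc - 1);
}

constexpr unsigned bcastType(uint16_t F) {
  return (F & BCAST_MASK) >> BCAST_SHIFT;
}

int main() {
  // TB_ALIGN_16 is encoded as 5 (1 << (5-1) == 16), TB_ALIGN_64 as 7.
  assert(alignment(5 << ALIGN_SHIFT) == 16);
  assert(alignment(7 << ALIGN_SHIFT) == 64);
  assert(operandIndex(0x3) == 3 && bcastType(2 << BCAST_SHIFT) == 2);
}
```
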
// This struct is used for both the folding and unfold tables. The KeyOp
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 096cc27861ca..de6f8a81dff6 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+def X86VBroadcastld8 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 1;
+}]>;
+
+def X86VBroadcastld16 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 2;
+}]>;
+
+def X86VBroadcastld32 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86VBroadcastld64 : PatFrag<(ops node:$src),
+ (X86VBroadcastld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
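
The X86VBroadcastld fragments all wrap the same VBROADCAST_LOAD node and differ only in the predicate on the memory access size, so patterns can key on how many bytes the broadcast reads. The dispatch, restated in plain C++ with hypothetical enum names:

```cpp
#include <cassert>

// One fragment per broadcast element width; anything else does not match.
enum BcastFrag { Bcast8, Bcast16, Bcast32, Bcast64, NoMatch };

BcastFrag classifyBroadcastLoad(unsigned StoreSizeInBytes) {
  switch (StoreSizeInBytes) {
  case 1:  return Bcast8;    // X86VBroadcastld8
  case 2:  return Bcast16;   // X86VBroadcastld16
  case 4:  return Bcast32;   // X86VBroadcastld32
  case 8:  return Bcast64;   // X86VBroadcastld64
  default: return NoMatch;
  }
}

int main() { assert(classifyBroadcastLoad(4) == Bcast32); }
```
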
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
@@ -963,6 +985,10 @@ def fp64imm0 : PatLeaf<(f64 fpimm), [{
return N->isExactlyValue(+0.0);
}]>;
+def fp128imm0 : PatLeaf<(f128 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
// to VEXTRACTF128/VEXTRACTI128 imm.
def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index dbe45356c42b..c29029daeec9 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -30,7 +30,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -465,7 +465,7 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
/// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
// Don't waste compile time scanning use-def chains of physregs.
- if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+ if (!Register::isVirtualRegister(BaseReg))
return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
@@ -480,9 +480,50 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
}
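
The unsigned-to-Register migration in this file is mechanical, but the check behind Register::isVirtualRegister is worth spelling out. Assuming LLVM's numbering scheme in which physical registers are small indices and virtual registers start at 2^31, the test reduces to looking at the sign bit. A standalone sketch; the real class also handles stack slots and NoRegister:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical miniature of the Register wrapper: the top bit of the 32-bit
// index distinguishes virtual from physical registers.
struct Reg {
  uint32_t Id;
  bool isVirtual()  const { return static_cast<int32_t>(Id) < 0; }
  bool isPhysical() const { return Id != 0 && !isVirtual(); }
};

int main() {
  Reg Phys{50};                 // some physical register number
  Reg Virt{0x80000000u};        // first virtual register
  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(Virt.isVirtual());
}
```
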
bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const {
+ AAResults *AA) const {
switch (MI.getOpcode()) {
- default: break;
+ default:
+ // This function should only be called for opcodes with the ReMaterializable
+ // flag set.
+ llvm_unreachable("Unknown rematerializable operation!");
+ break;
+
+ case X86::LOAD_STACK_GUARD:
+ case X86::AVX1_SETALLONES:
+ case X86::AVX2_SETALLONES:
+ case X86::AVX512_128_SET0:
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0:
+ case X86::AVX512_512_SETALLONES:
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0F128:
+ case X86::AVX_SET0:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::KSET0D:
+ case X86::KSET0Q:
+ case X86::KSET0W:
+ case X86::KSET1D:
+ case X86::KSET1Q:
+ case X86::KSET1W:
+ case X86::MMX_SET0:
+ case X86::MOV32ImmSExti8:
+ case X86::MOV32r0:
+ case X86::MOV32r1:
+ case X86::MOV32r_1:
+ case X86::MOV32ri64:
+ case X86::MOV64ImmSExti8:
+ case X86::V_SET0:
+ case X86::V_SETALLONES:
+ case X86::MOV16ri:
+ case X86::MOV32ri:
+ case X86::MOV64ri:
+ case X86::MOV64ri32:
+ case X86::MOV8ri:
+ return true;
+
case X86::MOV8rm:
case X86::MOV8rm_NOREX:
case X86::MOV16rm:
@@ -561,7 +602,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
MI.isDereferenceableInvariantLoad(AA)) {
- unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
@@ -583,7 +624,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
// lea fi#, lea GV, etc. are all rematerializable.
if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
return true;
- unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
+ Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
@@ -594,10 +635,6 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return false;
}
}
-
- // All other instructions marked M_REMATERIALIZABLE are always trivially
- // rematerializable.
- return true;
}
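
With the default case now unreachable, every opcode that carries the rematerializable flag must be classified explicitly: either it materializes a constant outright, or it is a load that the surrounding checks prove invariant with a constant or PIC base. The underlying decision, reduced to a sketch with a hypothetical classification enum:

```cpp
#include <cassert>

// Rematerialization means re-executing the defining instruction instead of
// spilling and reloading its result.  That is only safe when the definition
// depends on nothing that may have changed in the meantime.
enum DefKind { MaterializesConstant, InvariantLoadConstBase, OrdinaryLoad, Other };

bool canRematerialize(DefKind K) {
  switch (K) {
  case MaterializesConstant:   return true;   // MOV32r0, V_SET0, KSET*, ...
  case InvariantLoadConstBase: return true;   // e.g. load from constant pool
  case OrdinaryLoad:           return false;  // memory may have changed
  case Other:                  return false;
  }
  return false;
}

int main() { assert(canRematerialize(MaterializesConstant)); }
```
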
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
@@ -664,7 +701,7 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
}
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
- unsigned Opc, bool AllowSP, unsigned &NewSrc,
+ unsigned Opc, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
LiveVariables *LV) const {
MachineFunction &MF = *MI.getParent()->getParent();
@@ -675,7 +712,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
RC = Opc != X86::LEA32r ?
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
}
- unsigned SrcReg = Src.getReg();
+ Register SrcReg = Src.getReg();
// For both LEA64 and LEA32 the register already has essentially the right
// type (32-bit or 64-bit) we may just need to forbid SP.
@@ -684,7 +721,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
isKill = Src.isKill();
assert(!Src.isUndef() && "Undef op doesn't need optimization");
- if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+ if (Register::isVirtualRegister(NewSrc) &&
!MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
@@ -693,7 +730,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (Register::isPhysicalRegister(SrcReg)) {
ImplicitOp = Src;
ImplicitOp.setImplicit();
@@ -740,8 +777,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
return nullptr;
unsigned Opcode = X86::LEA64_32r;
- unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
- unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+ Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// we will be shifting and then extracting the lower 8/16-bits.
@@ -751,8 +788,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
// But testing has shown this *does* help performance in 64-bit mode (at
// least on modern x86 machines).
MachineBasicBlock::iterator MBBI = MI.getIterator();
- unsigned Dest = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
bool IsDead = MI.getOperand(0).isDead();
bool IsKill = MI.getOperand(1).isKill();
unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
@@ -794,7 +831,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
case X86::ADD8rr_DB:
case X86::ADD16rr:
case X86::ADD16rr_DB: {
- unsigned Src2 = MI.getOperand(2).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
bool IsKill2 = MI.getOperand(2).isKill();
assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
unsigned InRegLEA2 = 0;
@@ -888,7 +925,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ if (Register::isVirtualRegister(Src.getReg()) &&
!MF.getRegInfo().constrainRegClass(Src.getReg(),
&X86::GR64_NOSPRegClass))
return nullptr;
@@ -911,7 +948,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// LEA can't handle ESP.
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, ImplicitOp, LV))
@@ -947,7 +984,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
(Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
ImplicitOp, LV))
@@ -970,7 +1007,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
: (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
ImplicitOp, LV))
@@ -1005,7 +1042,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1013,7 +1050,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
const MachineOperand &Src2 = MI.getOperand(2);
bool isKill2;
- unsigned SrcReg2;
+ Register SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
SrcReg2, isKill2, ImplicitOp2, LV))
@@ -1054,7 +1091,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1085,6 +1122,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
case X86::SUB32ri8:
case X86::SUB32ri: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
int64_t Imm = MI.getOperand(2).getImm();
if (!isInt<32>(-Imm))
return nullptr;
@@ -1093,7 +1132,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill;
- unsigned SrcReg;
+ Register SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, ImplicitOp, LV))
@@ -1111,6 +1150,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::SUB64ri8:
case X86::SUB64ri32: {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
int64_t Imm = MI.getOperand(2).getImm();
if (!isInt<32>(-Imm))
return nullptr;
@@ -1140,40 +1181,62 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
- case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
+ case X86::VBROADCASTSDZ256mk:
+ case X86::VBROADCASTSDZmk:
+ case X86::VBROADCASTSSZ128mk:
+ case X86::VBROADCASTSSZ256mk:
+ case X86::VBROADCASTSSZmk:
+ case X86::VPBROADCASTDZ128mk:
+ case X86::VPBROADCASTDZ256mk:
+ case X86::VPBROADCASTDZmk:
+ case X86::VPBROADCASTQZ128mk:
+ case X86::VPBROADCASTQZ256mk:
+ case X86::VPBROADCASTQZmk: {
unsigned Opc;
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
- case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
- case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
- case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
- case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
- case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
- case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
- case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break;
+ case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break;
+ case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break;
+ case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break;
+ case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break;
+ case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break;
+ case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break;
+ case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break;
+ case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break;
+ case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break;
+ case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break;
}
NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1187,6 +1250,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
.add(MI.getOperand(7));
break;
}
+
case X86::VMOVDQU8Z128rrk:
case X86::VMOVDQU8Z256rrk:
case X86::VMOVDQU8Zrrk:
@@ -1683,6 +1747,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::VCMPSDZrr:
+ case X86::VCMPSSZrr:
+ case X86::VCMPPDZrri:
+ case X86::VCMPPSZrri:
+ case X86::VCMPPDZ128rri:
+ case X86::VCMPPSZ128rri:
+ case X86::VCMPPDZ256rri:
+ case X86::VCMPPSZ256rri:
+ case X86::VCMPPDZrrik:
+ case X86::VCMPPSZrrik:
+ case X86::VCMPPDZ128rrik:
+ case X86::VCMPPSZ128rrik:
+ case X86::VCMPPDZ256rrik:
+ case X86::VCMPPSZ256rrik: {
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f;
+ Imm = X86::getSwappedVCMPImm(Imm);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::VPERM2F128rr:
case X86::VPERM2I128rr: {
// Flip permute source immediate.
@@ -1859,7 +1944,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
- unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
+ Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
unsigned CommutableOpIdx1;
for (CommutableOpIdx1 = LastCommutableVecOp;
@@ -1889,7 +1974,8 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
return true;
}
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
const MCInstrDesc &Desc = MI.getDesc();
if (!Desc.isCommutable())
@@ -1926,17 +2012,23 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
switch (Imm) {
+ default:
+ // EVEX versions can be commuted.
+ if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
+ break;
+ return false;
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
- // The indices of the commutable operands are 1 and 2 (or 2 and 3
- // when masked).
- // Assign them to the returned operand indices here.
- return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
- 2 + OpOffset);
+ break;
}
- return false;
+
+ // The indices of the commutable operands are 1 and 2 (or 2 and 3
+ // when masked).
+ // Assign them to the returned operand indices here.
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
+ 2 + OpOffset);
}
case X86::MOVSSrr:
// X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
@@ -1990,6 +2082,24 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPDPWSSDZ128r:
+ case X86::VPDPWSSDZ128rk:
+ case X86::VPDPWSSDZ128rkz:
+ case X86::VPDPWSSDZ256r:
+ case X86::VPDPWSSDZ256rk:
+ case X86::VPDPWSSDZ256rkz:
+ case X86::VPDPWSSDZr:
+ case X86::VPDPWSSDZrk:
+ case X86::VPDPWSSDZrkz:
+ case X86::VPDPWSSDSZ128r:
+ case X86::VPDPWSSDSZ128rk:
+ case X86::VPDPWSSDSZ128rkz:
+ case X86::VPDPWSSDSZ256r:
+ case X86::VPDPWSSDSZ256rk:
+ case X86::VPDPWSSDSZ256rkz:
+ case X86::VPDPWSSDSZr:
+ case X86::VPDPWSSDSZrk:
+ case X86::VPDPWSSDSZrkz:
case X86::VPMADD52HUQZ128r:
case X86::VPMADD52HUQZ128rk:
case X86::VPMADD52HUQZ128rkz:
@@ -2215,7 +2325,7 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
}
}
-/// Get the VPCMP immediate if the opcodes are swapped.
+/// Get the VPCMP immediate if the operands are swapped.
unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
switch (Imm) {
default: llvm_unreachable("Unreachable!");
@@ -2233,7 +2343,7 @@ unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
return Imm;
}
-/// Get the VPCOM immediate if the opcodes are swapped.
+/// Get the VPCOM immediate if the operands are swapped.
unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
switch (Imm) {
default: llvm_unreachable("Unreachable!");
@@ -2251,6 +2361,23 @@ unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
return Imm;
}
+/// Get the VCMP immediate if the operands are swapped.
+unsigned X86::getSwappedVCMPImm(unsigned Imm) {
+  // Only need the lower 2 bits to distinguish.
+ switch (Imm & 0x3) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x00: case 0x03:
+ // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
+ break;
+ case 0x01: case 0x02:
+ // Need to toggle bits 3:0. Bit 4 stays the same.
+ Imm ^= 0xf;
+ break;
+ }
+
+ return Imm;
+}
+
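A worked example of the new helper, derived from the switch above (illustration only, not part of the patch), using the usual AVX compare-predicate encodings:

  // Symmetric predicates (lower two bits 0 or 3) are returned unchanged:
  getSwappedVCMPImm(0x00 /*EQ_OQ*/)   == 0x00
  getSwappedVCMPImm(0x03 /*UNORD_Q*/) == 0x03
  // Ordered relational predicates flip direction; bits 3:0 toggle and
  // bit 4 is preserved:
  getSwappedVCMPImm(0x01 /*LT_OS*/)   == 0x0E /*GT_OS*/
  getSwappedVCMPImm(0x02 /*LE_OS*/)   == 0x0D /*GE_OS*/

This is what lets commuteInstructionImpl swap the sources of the EVEX VCMP*Z* forms for any immediate, while the VEX/SSE compares stay commutable only for the four symmetric predicates checked in findCommutedOpIndices.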
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator()) return false;
@@ -3131,25 +3258,6 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(isKill));
}
-void X86InstrInfo::storeRegToAddr(
- MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
- DebugLoc DL;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.add(Addr[i]);
- MIB.addReg(SrcReg, getKillRegState(isKill));
- MIB.setMemRefs(MMOs);
- NewMIs.push_back(MIB);
-}
-
-
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIdx,
@@ -3164,23 +3272,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
}
-void X86InstrInfo::loadRegFromAddr(
- MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const {
- const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
- DebugLoc DL;
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
- for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.add(Addr[i]);
- MIB.setMemRefs(MMOs);
- NewMIs.push_back(MIB);
-}
-
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
@@ -3599,8 +3690,9 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (!IsCmpZero && !Sub)
return false;
- bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg);
+ bool IsSwapped =
+ (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg);
// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
@@ -3755,7 +3847,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def.
@@ -3785,7 +3877,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
@@ -3815,7 +3907,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
// Insert the XOR.
BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
@@ -3891,7 +3983,7 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
auto Flags = MachineMemOperand::MOLoad |
@@ -3929,7 +4021,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
const MCInstrDesc &LoadDesc,
const MCInstrDesc &BroadcastDesc,
unsigned SubIdx) {
- unsigned DestReg = MIB->getOperand(0).getReg();
+ Register DestReg = MIB->getOperand(0).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(DestReg) < 16) {
// We can use a normal VEX encoded load.
@@ -3952,7 +4044,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
const MCInstrDesc &StoreDesc,
const MCInstrDesc &ExtractDesc,
unsigned SubIdx) {
- unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+ Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(SrcReg) < 16) {
// We can use a normal VEX encoded store.
@@ -4008,12 +4100,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
+ case X86::FsFLD0F128:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0: {
assert(HasAVX && "AVX not supported");
const TargetRegisterInfo *TRI = &getRegisterInfo();
- unsigned SrcReg = MIB->getOperand(0).getReg();
- unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ Register SrcReg = MIB->getOperand(0).getReg();
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB, get(X86::VXORPSrr));
MIB.addReg(SrcReg, RegState::ImplicitDefine);
@@ -4021,9 +4114,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_128_SET0:
case X86::AVX512_FsFLD0SS:
- case X86::AVX512_FsFLD0SD: {
+ case X86::AVX512_FsFLD0SD:
+ case X86::AVX512_FsFLD0F128: {
bool HasVLX = Subtarget.hasVLX();
- unsigned SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB->getOperand(0).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
return Expand2AddrUndef(MIB,
@@ -4037,10 +4131,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0: {
bool HasVLX = Subtarget.hasVLX();
- unsigned SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB->getOperand(0).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
- unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB,
get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
@@ -4060,14 +4154,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX1_SETALLONES: {
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
// VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
MIB->setDesc(get(X86::VCMPPSYrri));
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
case X86::AVX512_512_SETALLONES: {
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
@@ -4077,8 +4171,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_512_SEXT_MASK_32:
case X86::AVX512_512_SEXT_MASK_64: {
- unsigned Reg = MIB->getOperand(0).getReg();
- unsigned MaskReg = MIB->getOperand(1).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
+ Register MaskReg = MIB->getOperand(1).getReg();
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
@@ -4115,8 +4209,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::MOV32ri64: {
- unsigned Reg = MIB->getOperand(0).getReg();
- unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
+ Register Reg = MIB->getOperand(0).getReg();
+ Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
MI.setDesc(get(X86::MOV32ri));
MIB->getOperand(0).setReg(Reg32);
MIB.addReg(Reg, RegState::ImplicitDefine);
@@ -4251,8 +4345,8 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
@@ -4268,7 +4362,10 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction the copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
+static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
+ bool ForLoadFold = false) {
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
switch (Opcode) {
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
@@ -4427,6 +4524,14 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
return true;
+ case X86::VMOVSSZrrk:
+ case X86::VMOVSDZrrk:
+ OpNum = 3;
+ return true;
+ case X86::VMOVSSZrrkz:
+ case X86::VMOVSDZrrkz:
+ OpNum = 2;
+ return true;
}
return false;
@@ -4449,14 +4554,11 @@ static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode()))
+ if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
return 0;
- // Set the OpNum parameter to the first source operand.
- OpNum = 1;
-
const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
return UndefRegClearance;
}
return 0;
@@ -4464,7 +4566,7 @@ X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
void X86InstrInfo::breakPartialRegDependency(
MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
- unsigned Reg = MI.getOperand(OpNum).getReg();
+ Register Reg = MI.getOperand(OpNum).getReg();
// If MI kills this register, the false dependence is already broken.
if (MI.killsRegister(Reg, TRI))
return;
@@ -4480,7 +4582,7 @@ void X86InstrInfo::breakPartialRegDependency(
} else if (X86::VR256RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)
@@ -4489,7 +4591,7 @@ void X86InstrInfo::breakPartialRegDependency(
} else if (X86::GR64RegClass.contains(Reg)) {
// Using XOR32rr because it has shorter encoding and zeros up the upper bits
// as well.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+ Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)
@@ -4538,8 +4640,8 @@ static void updateOperandRegConstraints(MachineFunction &MF,
// We only need to update constraints on virtual register operands.
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TRI.isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
auto *NewRC = MRI.constrainRegClass(
@@ -4698,7 +4800,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
- if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) ||
+ unsigned Ignored;
+ if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
@@ -4788,6 +4891,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+ MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0;
if (Align < MinAlign)
return nullptr;
bool NarrowToMOV32rm = false;
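For reference, the added decode step turns the table's alignment field back into bytes; a worked example, assuming the fold tables now encode log2(alignment) + 1 with 0 meaning no requirement:

  field 0 -> MinAlign = 0            (no alignment constraint)
  field 5 -> MinAlign = 1 << 4 = 16  bytes
  field 6 -> MinAlign = 1 << 5 = 32  bytes
  field 7 -> MinAlign = 1 << 6 = 64  bytes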
@@ -4821,8 +4925,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// If this is the special case where we use a MOV32rm to load a 32-bit
// value and zero-extend the top bits. Change the destination register
// to a 32-bit one.
- unsigned DstReg = NewMI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = NewMI->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
@@ -5133,6 +5237,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX512_128_SET0:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128:
Alignment = 16;
break;
case X86::MMX_SET0:
@@ -5182,7 +5288,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
case X86::FsFLD0SS:
- case X86::AVX512_FsFLD0SS: {
+ case X86::AVX512_FsFLD0SS:
+ case X86::FsFLD0F128:
+ case X86::AVX512_FsFLD0F128: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
@@ -5212,6 +5320,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Ty = Type::getFloatTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction().getContext());
+ else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
+ Ty = Type::getFP128Ty(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
@@ -5293,6 +5403,51 @@ extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
return StoreMMOs;
}
+static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
+ const TargetRegisterClass *RC,
+ const X86Subtarget &STI) {
+ assert(STI.hasAVX512() && "Expected at least AVX512!");
+ unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
+ assert((SpillSize == 64 || STI.hasVLX()) &&
+ "Can't broadcast less than 64 bytes without AVX512VL!");
+
+ switch (I->Flags & TB_BCAST_MASK) {
+ default: llvm_unreachable("Unexpected broadcast type!");
+ case TB_BCAST_D:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTDZ128m;
+ case 32: return X86::VPBROADCASTDZ256m;
+ case 64: return X86::VPBROADCASTDZm;
+ }
+ break;
+ case TB_BCAST_Q:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VPBROADCASTQZ128m;
+ case 32: return X86::VPBROADCASTQZ256m;
+ case 64: return X86::VPBROADCASTQZm;
+ }
+ break;
+ case TB_BCAST_SS:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VBROADCASTSSZ128m;
+ case 32: return X86::VBROADCASTSSZ256m;
+ case 64: return X86::VBROADCASTSSZm;
+ }
+ break;
+ case TB_BCAST_SD:
+ switch (SpillSize) {
+ default: llvm_unreachable("Unknown spill size");
+ case 16: return X86::VMOVDDUPZ128rm;
+ case 32: return X86::VBROADCASTSDZ256m;
+ case 64: return X86::VBROADCASTSDZm;
+ }
+ break;
+ }
+}
+
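To show where getBroadcastOpcode fits, a hypothetical unfolding example (the VPADDD pairing is an assumption for illustration; the opcode mapping is the one in the switch above):

  // A fold-table entry with TB_FOLDED_LOAD | TB_FOLDED_BCAST | TB_BCAST_D and
  // register class VR512 (spill size 64) yields X86::VPBROADCASTDZm, so
  // unfolding re-creates the broadcast rather than a full 64-byte reload:
  //   vpbroadcastd (%rdi), %zmm1
  //   vpaddd       %zmm1, %zmm0, %zmm0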
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
@@ -5303,6 +5458,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -5311,7 +5467,9 @@ bool X86InstrInfo::unfoldMemoryOperand(
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
+
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
// TODO: Check if 32-byte or greater accesses are slow too?
if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
@@ -5335,10 +5493,26 @@ bool X86InstrInfo::unfoldMemoryOperand(
AfterOps.push_back(Op);
}
- // Emit the load instruction.
+ // Emit the load or broadcast instruction.
if (UnfoldLoad) {
auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
- loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs, NewMIs);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
+ }
+
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
+
if (UnfoldStore) {
// Address operands cannot be marked isKill.
for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
@@ -5404,7 +5578,16 @@ bool X86InstrInfo::unfoldMemoryOperand(
if (UnfoldStore) {
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
- storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs, NewMIs);
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = AddrOps.size(); i != e; ++i)
+ MIB.add(AddrOps[i]);
+ MIB.addReg(Reg, RegState::Kill);
+ MIB.setMemRefs(MMOs);
+ NewMIs.push_back(MIB);
}
return true;
@@ -5423,6 +5606,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
unsigned Index = I->Flags & TB_INDEX_MASK;
bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
bool FoldedStore = I->Flags & TB_FOLDED_STORE;
+ bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
@@ -5456,10 +5640,17 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
- Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
- VT, MVT::Other, AddrOps);
+
+ unsigned Opc;
+ if (FoldedBCast) {
+ Opc = getBroadcastOpcode(I, RC, Subtarget);
+ } else {
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
+ }
+
+ Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
// Preserve memory reference information.
@@ -7367,6 +7558,96 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
}
}
+Optional<ParamLoadedValue>
+X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const {
+ const MachineOperand *Op = nullptr;
+ DIExpression *Expr = nullptr;
+
+ switch (MI.getOpcode()) {
+ case X86::LEA32r:
+ case X86::LEA64r:
+ case X86::LEA64_32r: {
+    // Operand 4 could be a global address. For now we do not support
+    // such a situation.
+ if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
+ return None;
+
+ const MachineOperand &Op1 = MI.getOperand(1);
+ const MachineOperand &Op2 = MI.getOperand(3);
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister ||
+ Register::isPhysicalRegister(Op2.getReg())));
+
+ // Omit situations like:
+ // %rsi = lea %rsi, 4, ...
+ if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
+ Op2.getReg() == MI.getOperand(0).getReg())
+ return None;
+ else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
+ (Op2.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
+ return None;
+
+ int64_t Coef = MI.getOperand(2).getImm();
+ int64_t Offset = MI.getOperand(4).getImm();
+ SmallVector<uint64_t, 8> Ops;
+
+    if (Op1.isReg() && Op1.getReg() != X86::NoRegister) {
+ Op = &Op1;
+ } else if (Op1.isFI())
+ Op = &Op1;
+
+ if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef + 1);
+ Ops.push_back(dwarf::DW_OP_mul);
+ } else {
+ if (Op && Op2.getReg() != X86::NoRegister) {
+ int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
+ if (dwarfReg < 0)
+ return None;
+ else if (dwarfReg < 32) {
+ Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
+ Ops.push_back(0);
+ } else {
+ Ops.push_back(dwarf::DW_OP_bregx);
+ Ops.push_back(dwarfReg);
+ Ops.push_back(0);
+ }
+ } else if (!Op) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Op = &Op2;
+ }
+
+ if (Coef > 1) {
+ assert(Op2.getReg() != X86::NoRegister);
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(Coef);
+ Ops.push_back(dwarf::DW_OP_mul);
+ }
+
+ if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
+ Op2.getReg() != X86::NoRegister) {
+ Ops.push_back(dwarf::DW_OP_plus);
+ }
+ }
+
+ DIExpression::appendOffset(Ops, Offset);
+ Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
+
+    return ParamLoadedValue(*Op, Expr);
+ }
+ case X86::XOR32rr: {
+ if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
+ return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
+ return None;
+ }
+ default:
+ return TargetInstrInfo::describeLoadedValue(MI);
+ }
+}
+
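To make the LEA case concrete, a worked example (illustration only; register numbers use the standard x86-64 DWARF mapping, where RSI is register 4, and appendOffset is assumed to emit DW_OP_plus_uconst for a positive displacement):

  leaq 8(%rdi,%rsi,4), %rax
    Op  = the %rdi operand (base register)
    Ops = { DW_OP_breg4 0, DW_OP_constu 4, DW_OP_mul, DW_OP_plus,
            DW_OP_plus_uconst 8 }

i.e. the described value is rdi + rsi*4 + 8. When base and index are the same register and the scale is positive, the expression collapses to DW_OP_constu (scale + 1), DW_OP_mul; the XOR32rr case above reports a plain constant 0 for the common register-zeroing idiom.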
/// This is an architecture-specific helper function of reassociateOps.
/// Set special operand attributes for new instructions after reassociation.
void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
@@ -7500,9 +7781,8 @@ namespace {
// movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
// addq %rcx, %rax
// RAX now holds address of _GLOBAL_OFFSET_TABLE_.
- unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
- unsigned GOTReg =
- RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+ Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
.addReg(X86::RIP)
.addImm(0)
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 13ca17139494..22b7b1d4cb19 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -67,6 +67,9 @@ unsigned getSwappedVPCMPImm(unsigned Imm);
/// Get the VPCOM immediate if the opcodes are swapped.
unsigned getSwappedVPCOMImm(unsigned Imm);
+/// Get the VCMP immediate if the operands are swapped.
+unsigned getSwappedVCMPImm(unsigned Imm);
+
} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
@@ -203,7 +206,7 @@ public:
int &FrameIndex) const override;
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
@@ -218,7 +221,7 @@ public:
/// Reference parameters are set to indicate how caller should add this
/// operand to the LEA instruction.
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
- unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
+ unsigned LEAOpcode, bool AllowSP, Register &NewSrc,
bool &isKill, MachineOperand &ImplicitOp,
LiveVariables *LV) const;
@@ -251,7 +254,7 @@ public:
/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
@@ -317,23 +320,11 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const;
-
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
- SmallVectorImpl<MachineOperand> &Addr,
- const TargetRegisterClass *RC,
- ArrayRef<MachineMemOperand *> MMOs,
- SmallVectorImpl<MachineInstr *> &NewMIs) const;
-
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Check whether the target can fold a load that feeds a subreg operand
@@ -527,6 +518,13 @@ public:
#define GET_INSTRINFO_HELPER_DECLS
#include "X86GenInstrInfo.inc"
+ static bool hasLockPrefix(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & X86II::LOCK;
+ }
+
+ Optional<ParamLoadedValue>
+ describeLoadedValue(const MachineInstr &MI) const override;
+
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8e05dd8ec5c1..e452145f3b65 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -673,6 +673,14 @@ def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
ImmSExti64i32AsmOperand];
}
+// 4-bit immediate used by some XOP instructions
+// [0, 0xF]
+def ImmUnsignedi4AsmOperand : AsmOperandClass {
+ let Name = "ImmUnsignedi4";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidImmUnsignedi4";
+}
+
// Unsigned immediate used by SSE/AVX instructions
// [0, 0xFF]
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
@@ -705,6 +713,13 @@ def i64i8imm : Operand<i64> {
let OperandType = "OPERAND_IMMEDIATE";
}
+// Unsigned 4-bit immediate used by some XOP instructions.
+def u4imm : Operand<i8> {
+ let PrintMethod = "printU8Imm";
+ let ParserMatchClass = ImmUnsignedi4AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
// Unsigned 8-bit immediate used by SSE/AVX instructions.
def u8imm : Operand<i8> {
let PrintMethod = "printU8Imm";
@@ -925,7 +940,6 @@ def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">;
def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
-def HasMPX : Predicate<"Subtarget->hasMPX()">;
def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
@@ -1103,7 +1117,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return LD->getAlignment() >= 2 && LD->isSimple();
return false;
}]>;
@@ -1113,7 +1127,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
if (ExtType == ISD::NON_EXTLOAD)
return true;
if (ExtType == ISD::EXTLOAD)
- return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return LD->getAlignment() >= 4 && LD->isSimple();
return false;
}]>;
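(For context on the predicate change: a minimal sketch of what isSimple() amounts to, stated as an assumption about the SelectionDAG API rather than code from this patch. The practical effect is that these extending-load folds now also reject atomic loads, not just volatile ones.)

  // Sketch under the assumption above, not the actual implementation.
  #include "llvm/CodeGen/SelectionDAGNodes.h"
  static bool isSimpleLoad(const llvm::LoadSDNode *LD) {
    return !LD->isVolatile() && !LD->isAtomic();
  }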
@@ -1170,7 +1184,7 @@ def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (unindexedload node:$ptr)), [
if (LD->getMemoryVT() == MVT::i32)
return true;
- return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return LD->getAlignment() >= 4 && LD->isSimple();
}]>;
@@ -2404,25 +2418,26 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
- RegisterClass RC, X86MemOperand x86memop> {
+ RegisterClass RC, X86MemOperand x86memop,
+ X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[WriteBLS]>;
+ T8PS, VEX_4V, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[WriteBLS.Folded]>;
+ T8PS, VEX_4V, Sched<[sched.Folded]>;
}
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
- defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>;
- defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W;
- defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>;
- defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W;
- defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>;
- defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W;
+ defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>;
+ defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>;
+ defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, VEX_W;
+ defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>;
+ defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, VEX_W;
}
//===----------------------------------------------------------------------===//
@@ -2683,12 +2698,12 @@ def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
XOP_4V, XOPA;
}
@@ -2700,11 +2715,11 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>,
+ [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
XOP_4V, XOPA;
}
@@ -3205,13 +3220,13 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>;
// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
// Likewise for btc/btr/bts.
def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}",
- (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BT32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}",
- (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTC32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}",
- (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTR32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}",
- (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">;
+ (BTS32mi8 i32mem:$mem, i32u8imm:$imm), 0, "att">;
// clr aliases.
def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 57835b1a256a..cd9a866c91cb 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -30,7 +30,6 @@ def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
- // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
X86FoldableSchedWrite sched, bit Commutable = 0,
X86MemOperand OType = i64mem> {
@@ -67,7 +66,7 @@ let Constraints = "$src1 = $dst" in {
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>,
+ [(set VR64:$dst, (IntId2 VR64:$src1, timm:$src2))]>,
Sched<[schedImm]>;
}
}
@@ -114,13 +113,13 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 timm:$src3)))]>,
Sched<[sched]>;
def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+ (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -496,14 +495,14 @@ def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
(outs VR64:$dst), (ins VR64:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
- (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>,
+ (int_x86_sse_pshuf_w VR64:$src1, timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX]>;
def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
(outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2),
"pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR64:$dst,
(int_x86_sse_pshuf_w (load_mmx addr:$src1),
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[SchedWriteShuffle.MMX.Folded]>;
// -- Conversion Instructions
@@ -535,7 +534,7 @@ def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg,
(outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- imm:$src2))]>,
+ timm:$src2))]>,
Sched<[WriteVecExtract]>;
let Constraints = "$src1 = $dst" in {
let Predicates = [HasMMX, HasSSE1] in {
@@ -544,7 +543,7 @@ let Predicates = [HasMMX, HasSSE1] in {
(ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32orGR64:$src2, imm:$src3))]>,
+ GR32orGR64:$src2, timm:$src3))]>,
Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
@@ -553,7 +552,7 @@ let Predicates = [HasMMX, HasSSE1] in {
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
(i32 (anyext (loadi16 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
}
@@ -567,6 +566,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(int_x86_mmx_pmovmskb VR64:$src))]>,
Sched<[WriteMMXMOVMSK]>;
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
[SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -574,9 +580,13 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
(x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
(x86mmx (MMX_MOVQ64rm addr:$src))>;
+def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src)))))),
+ (MMX_MOVQ2DQrr VR64:$src)>;
+
// Misc.
let SchedRW = [SchedWriteShuffle.MMX] in {
let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
@@ -602,9 +612,6 @@ def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))),
(MMX_CVTTPS2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
- (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
- (MMX_CVTTPS2PIirr VR128:$src)>;
-def : Pat<(x86mmx (MMX_X86movdq2q
(bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
(MMX_CVTPD2PIirr VR128:$src)>;
def : Pat<(x86mmx (MMX_X86movdq2q
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index f7d931510fe2..44ba071947c2 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -12,16 +12,16 @@
//
//===----------------------------------------------------------------------===//
-// FIXME: Investigate a better scheduler class once MPX is used inside LLVM.
+// FIXME: Investigate a better scheduler class if MPX is ever used inside LLVM.
let SchedRW = [WriteSystem] in {
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
@@ -29,17 +29,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, Not64BitMode]>;
+ Requires<[Not64BitMode]>;
def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
- Requires<[HasMPX, In64BitMode]>;
+ Requires<[In64BitMode]>;
}
defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable;
defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable;
@@ -47,33 +47,31 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable;
def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX]>, NotMemoryFoldable;
+ NotMemoryFoldable;
let mayLoad = 1 in {
def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
+ Requires<[In64BitMode]>, NotMemoryFoldable;
}
let isCodeGenOnly = 1, ForceDisassemble = 1 in
def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX]>, NotMemoryFoldable;
+ NotMemoryFoldable;
let mayStore = 1 in {
def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable;
+ Requires<[Not64BitMode]>, NotMemoryFoldable;
def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
- Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable;
+ Requires<[In64BitMode]>, NotMemoryFoldable;
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
- Requires<[HasMPX]>;
+ "bndstx\t{$src, $dst|$dst, $src}", []>, PS;
}
let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
- Requires<[HasMPX]>;
+ "bndldx\t{$src, $dst|$dst, $src}", []>, PS;
} // SchedRW
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 7d0a5b87baf4..09a04c0338b4 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -115,7 +115,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
+ [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
+ def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}
//===----------------------------------------------------------------------===//
@@ -128,13 +130,18 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-let Predicates = [NoAVX512] in
+let Predicates = [NoAVX512] in {
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+}
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
@@ -147,6 +154,14 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
+let Predicates = [NoAVX512] in {
+def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+}
+
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
@@ -355,7 +370,7 @@ defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
SSEPackedSingle, SchedWriteFMoveLS.YMM>,
PS, VEX, VEX_L, VEX_WIG;
-defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
SSEPackedDouble, SchedWriteFMoveLS.YMM>,
PD, VEX, VEX_L, VEX_WIG;
}
@@ -661,7 +676,7 @@ let Predicates = [UseSSE1] in {
// This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for aligned load, we're only loading 64-bits.
- def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
+ def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
(i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
@@ -727,7 +742,7 @@ let Predicates = [UseSSE1] in {
// This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
// end up with a movsd or blend instead of shufp.
// No need for aligned load, we're only loading 64-bits.
- def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
@@ -761,7 +776,7 @@ let Predicates = [UseSSE2] in {
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
// Use MOVLPD to load into the low bits from a full vector unless we can use
// BLENDPD.
- def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
+ def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
@@ -1713,12 +1728,12 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
Sched<[sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), imm:$cc))]>,
+ (ld_frag addr:$src2), timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1751,13 +1766,13 @@ multiclass sse12_cmp_scalar_int<Operand memop,
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, imm:$cc))]>,
+ VR128:$src, timm:$cc))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, imm:$cc))]>,
+ mem_cpat:$src, timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1876,12 +1891,12 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1906,7 +1921,7 @@ let Constraints = "$src1 = $dst" in {
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}
-def CommutableCMPCC : PatLeaf<(imm), [{
+def CommutableCMPCC : PatLeaf<(timm), [{
uint64_t Imm = N->getZExtValue() & 0x7;
return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
@@ -1915,47 +1930,47 @@ def CommutableCMPCC : PatLeaf<(imm), [{
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
CommutableCMPCC:$cc)),
- (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
CommutableCMPCC:$cc)),
- (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
CommutableCMPCC:$cc)),
- (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+ (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
CommutableCMPCC:$cc)),
- (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+ (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}
let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
CommutableCMPCC:$cc)),
- (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+ (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
CommutableCMPCC:$cc)),
- (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+ (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
CommutableCMPCC:$cc)),
- (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+ (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}
//===----------------------------------------------------------------------===//
@@ -1970,13 +1985,13 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], d>,
+ (i8 timm:$src3))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = IsCommutable in
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], d>,
+ (i8 timm:$src3))))], d>,
Sched<[sched]>;
}
@@ -2097,7 +2112,7 @@ let Predicates = [HasAVX1Only] in {
let Predicates = [UseSSE2] in {
// Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (v2f64 (nonvolatile_load addr:$src2)))),
+ (v2f64 (simple_load addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
}
@@ -2721,7 +2736,7 @@ defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64,
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
-
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -3482,7 +3497,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
Sched<[schedImm]>;
}
@@ -3514,7 +3529,7 @@ multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
}
@@ -3597,7 +3612,7 @@ let Predicates = [HasAVX, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
VEX, Sched<[sched.XMM]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
@@ -3605,7 +3620,7 @@ let Predicates = [HasAVX, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (load addr:$src1),
- (i8 imm:$src2))))]>, VEX,
+ (i8 timm:$src2))))]>, VEX,
Sched<[sched.XMM.Folded]>, VEX_WIG;
}
@@ -3615,7 +3630,7 @@ let Predicates = [HasAVX2, prd] in {
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
+ (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
@@ -3623,7 +3638,7 @@ let Predicates = [HasAVX2, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode (load addr:$src1),
- (i8 imm:$src2))))]>, VEX, VEX_L,
+ (i8 timm:$src2))))]>, VEX, VEX_L,
Sched<[sched.YMM.Folded]>, VEX_WIG;
}
@@ -3633,7 +3648,7 @@ let Predicates = [UseSSE2] in {
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
+ (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
Sched<[sched.XMM]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
@@ -3641,7 +3656,7 @@ let Predicates = [UseSSE2] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (memop addr:$src1),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[sched.XMM.Folded]>;
}
}
@@ -4380,7 +4395,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
@@ -4388,7 +4403,7 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [UseSSE3] in {
// No need for aligned memory as this only loads 64-bits.
- def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
(MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
@@ -4812,7 +4827,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
Sched<[sched]>;
let mayLoad = 1 in
def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
@@ -4823,7 +4838,7 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst, (VT (X86PAlignr RC:$src1,
(memop_frag addr:$src2),
- (i8 imm:$src3))))]>,
+ (i8 timm:$src3))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5300,7 +5315,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
Sched<[SchedWriteFShuffle.XMM]>;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u8imm:$src3),
@@ -5311,7 +5326,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
[(set VR128:$dst,
(X86insertps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -5323,17 +5338,6 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
-let Predicates = [UseAVX] in {
- // If we're inserting an element from a vbroadcast of a load, fold the
- // load into the X86insertps instruction.
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
- (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
- (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-}
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
@@ -5348,7 +5352,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
Sched<[sched]>;
// Vector intrinsic operation, mem
@@ -5357,13 +5361,13 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>,
+ (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
Sched<[sched.Folded]>;
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -5378,7 +5382,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -5396,7 +5400,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
-let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -5411,7 +5415,7 @@ let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
-let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -5431,7 +5435,7 @@ multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched,
ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
-let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedSingle in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -5439,7 +5443,7 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+ [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
@@ -5450,11 +5454,11 @@ let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
-let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
+let ExeDomain = SSEPackedDouble in {
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -5462,7 +5466,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
+ [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
Sched<[sched]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
@@ -5473,7 +5477,7 @@ let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
@@ -5508,17 +5512,17 @@ let Predicates = [UseAVX] in {
}
let Predicates = [UseAVX] in {
- def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
- (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
- (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
- (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
- (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
@@ -5535,17 +5539,17 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
- def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
- (ROUNDSSr FR32:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
- (ROUNDSDr FR64:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ (ROUNDSSr FR32:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ (ROUNDSDr FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
- (ROUNDSSm addr:$src1, imm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
- (ROUNDSDm addr:$src1, imm:$src2)>;
+ def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ (ROUNDSSm addr:$src1, timm:$src2)>;
+ def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ (ROUNDSDm addr:$src1, timm:$src2)>;
}
//===----------------------------------------------------------------------===//
@@ -5826,7 +5830,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5836,7 +5840,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
+ (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5853,7 +5857,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -5863,27 +5867,27 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-def BlendCommuteImm2 : SDNodeXForm<imm, [{
+def BlendCommuteImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0x03;
return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;
-def BlendCommuteImm4 : SDNodeXForm<imm, [{
+def BlendCommuteImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0x0f;
return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;
-def BlendCommuteImm8 : SDNodeXForm<imm, [{
+def BlendCommuteImm8 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue() & 0xff;
return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm4 : SDNodeXForm<imm, [{
+def BlendScaleImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 4; ++i) {
@@ -5894,7 +5898,7 @@ def BlendScaleImm4 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
-def BlendScaleImm2 : SDNodeXForm<imm, [{
+def BlendScaleImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5905,7 +5909,7 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
-def BlendScaleImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleImm2to4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5916,7 +5920,7 @@ def BlendScaleImm2to4 : SDNodeXForm<imm, [{
}]>;
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 4; ++i) {
@@ -5927,7 +5931,7 @@ def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
-def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
@@ -5938,7 +5942,7 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
-def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
+def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
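
The BlendScale*/BlendCommute* SDNodeXForms above are plain bit rewrites of the 8-bit blend immediate. As a reading aid only (not part of the patch; function names are illustrative), a minimal standalone C++ sketch of the v2i64 case, widening a 2-bit per-64-bit-lane mask to the 8-bit per-16-bit-lane pblendw mask and inverting it for a commuted operand order:

#include <cassert>
#include <cstdint>

// Mirrors BlendScaleImm2: each set bit selects one 64-bit lane, which
// corresponds to four 16-bit lanes in the pblendw immediate.
static uint8_t scaleBlendImm2To8(uint8_t Imm) {
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i)
    if (Imm & (1u << i))
      NewImm |= 0xfu << (i * 4);
  return NewImm;
}

// Mirrors BlendCommuteImm8: swapping the two sources flips every lane select.
static uint8_t commuteBlendImm8(uint8_t Imm) {
  return Imm ^ 0xff;
}

int main() {
  assert(scaleBlendImm2To8(0x2) == 0xf0);
  assert(commuteBlendImm8(scaleBlendImm2To8(0x2)) == 0x0f);
  return 0;
}
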
@@ -6008,7 +6012,7 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
@@ -6018,14 +6022,14 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
// Pattern to commute if load is in first source.
- def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
+ def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
- (commuteXForm imm:$src3))>;
+ (commuteXForm timm:$src3))>;
}
let Predicates = [HasAVX] in {
@@ -6061,37 +6065,37 @@ let Predicates = [HasAVX2] in {
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX1Only] in {
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
-
-def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
- (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
-def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
-def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
- (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
+
+def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
+ (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
+def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
+def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
+ (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}
defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
@@ -6107,19 +6111,19 @@ defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
-def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
+def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
-def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
- (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
- (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
+ (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
+ (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6592,7 +6596,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 imm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6600,7 +6604,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
- (i8 imm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TA,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6718,26 +6722,26 @@ let Predicates = [HasAVX, HasAES] in {
(ins VR128:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
Sched<[WriteAESKeyGen.Folded]>;
//===----------------------------------------------------------------------===//
@@ -6745,7 +6749,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
//===----------------------------------------------------------------------===//
// Immediate transform to help with commuting.
-def PCLMULCommuteImm : SDNodeXForm<imm, [{
+def PCLMULCommuteImm : SDNodeXForm<timm, [{
uint8_t Imm = N->getZExtValue();
return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;
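
PCLMULCommuteImm above swaps the two selector nibbles of the pclmulqdq immediate (bit 0 picks the quadword of the first source, bit 4 the quadword of the second), so the operands can be exchanged when the load is folded from the first source. A small standalone C++ sketch of the same bit trick (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors PCLMULCommuteImm: exchange the low and high nibbles so the same
// quadword product is selected after the two sources are swapped.
static uint8_t commutePCLMULImm(uint8_t Imm) {
  return (uint8_t)((Imm >> 4) | (Imm << 4));
}

int main() {
  assert(commutePCLMULImm(0x01) == 0x10);
  assert(commutePCLMULImm(0x11) == 0x11);
  return 0;
}
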
@@ -6758,7 +6762,7 @@ let Predicates = [NoAVX, HasPCLMUL] in {
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
Sched<[WriteCLMul]>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
@@ -6766,14 +6770,14 @@ let Predicates = [NoAVX, HasPCLMUL] in {
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
- imm:$src3))]>,
+ timm:$src3))]>,
Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
} // Constraints = "$src1 = $dst"
def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
- (i8 imm:$src3)),
+ (i8 timm:$src3)),
(PCLMULQDQrm VR128:$src1, addr:$src2,
- (PCLMULCommuteImm imm:$src3))>;
+ (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]
// SSE aliases
@@ -6795,21 +6799,21 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
(ins RC:$src1, RC:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
- (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ (IntId RC:$src1, RC:$src2, timm:$src3))]>,
Sched<[WriteCLMul]>;
def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, MemOp:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set RC:$dst,
- (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
+ (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
// We can commute a load in the first operand by swapping the sources and
// rotating the immediate.
- def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
+ def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
(!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
- (PCLMULCommuteImm imm:$src3))>;
+ (PCLMULCommuteImm timm:$src3))>;
}
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
@@ -6853,8 +6857,8 @@ let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
- [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
- imm:$idx))]>,
+ [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
+ timm:$idx))]>,
PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
@@ -6867,7 +6871,7 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
- imm:$len, imm:$idx))]>,
+ timm:$len, timm:$idx))]>,
XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
@@ -6907,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst),
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType VT,
- PatFrag ld_frag, SchedWrite Sched> :
+ PatFrag bcast_frag, SchedWrite Sched> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
Sched<[Sched]>, VEX;
// AVX2 adds register forms
@@ -6923,15 +6927,15 @@ class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
- f32mem, v4f32, loadf32,
+ f32mem, v4f32, X86VBroadcastld32,
SchedWriteFShuffle.XMM.Folded>;
def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
- f32mem, v8f32, loadf32,
+ f32mem, v8f32, X86VBroadcastld32,
SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
- v4f64, loadf64,
+ v4f64, X86VBroadcastld64,
SchedWriteFShuffle.XMM.Folded>, VEX_L;
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
@@ -6944,15 +6948,6 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (VBROADCASTSSrm addr:$src)>;
- def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (VBROADCASTSSYrm addr:$src)>;
- def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (VBROADCASTSDYrm addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
@@ -7081,27 +7076,29 @@ let Predicates = [HasAVX1Only] in {
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
Intrinsic IntLd, Intrinsic IntLd256,
- Intrinsic IntSt, Intrinsic IntSt256> {
+ Intrinsic IntSt, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[WriteFMaskedLoad]>;
+ VEX_4V, Sched<[schedX.RM]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[WriteFMaskedStore]>;
+ VEX_4V, Sched<[schedX.MR]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7109,13 +7106,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
int_x86_avx_maskload_ps,
int_x86_avx_maskload_ps_256,
int_x86_avx_maskstore_ps,
- int_x86_avx_maskstore_ps_256>;
+ int_x86_avx_maskstore_ps_256,
+ WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskload_pd,
int_x86_avx_maskload_pd_256,
int_x86_avx_maskstore_pd,
- int_x86_avx_maskstore_pd_256>;
+ int_x86_avx_maskstore_pd_256,
+ WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
@@ -7143,13 +7142,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
+ [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
Sched<[sched]>;
def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
(ins x86memop_f:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
+ (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
Sched<[sched.Folded]>;
}// Predicates = [HasAVX, NoVLX]
}
@@ -7181,38 +7180,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V, VEX_L,
+ (i8 timm:$src3))))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V, VEX_L,
+ (i8 timm:$src3)))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
-def Perm2XCommuteImm : SDNodeXForm<imm, [{
+def Perm2XCommuteImm : SDNodeXForm<timm, [{
return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
let Predicates = [HasAVX] in {
// Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
- (loadv4i64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+ (loadv4i64 addr:$src2), (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
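
Perm2XCommuteImm above XORs the vperm2f128/vperm2i128 selector with 0x22. Bits 1 and 5 of that immediate choose which source supplies each 128-bit half of the result, so flipping them compensates for swapping the two source operands when the load sits in the first operand. A minimal C++ sketch (illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Mirrors Perm2XCommuteImm: flip the "which source" bit of each half-select.
static uint8_t commutePerm2x128Imm(uint8_t Imm) {
  return Imm ^ 0x22;
}

int main() {
  // 0x31 = low half from src1's high lane, high half from src2's high lane;
  // with the sources swapped the equivalent selector is 0x13.
  assert(commutePerm2x128Imm(0x31) == 0x13);
  return 0;
}
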
//===----------------------------------------------------------------------===//
@@ -7257,7 +7256,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
+ [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
TAPD, VEX, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
@@ -7282,15 +7281,15 @@ let Predicates = [HasF16C, NoVLX] in {
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
- (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -7327,20 +7326,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins RC:$src1, RC:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
Sched<[sched]>, VEX_4V;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
+ (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
// Pattern to commute if load is in first source.
- def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
+ def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
(!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
- (commuteXForm imm:$src3))>;
+ (commuteXForm timm:$src3))>;
}
let Predicates = [HasAVX2] in {
@@ -7351,19 +7350,19 @@ defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
SchedWriteBlend.YMM, VR256, i256mem,
BlendCommuteImm8>, VEX_L;
-def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
- (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
- (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
- (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
+def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
+ (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
+ (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
-def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
- (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
- (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
-def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
- (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
+def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
+ (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
+def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
+ (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}
// For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7407,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0
// destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
- X86MemOperand x86memop, PatFrag ld_frag,
+ X86MemOperand x86memop, PatFrag bcast_frag,
ValueType OpVT128, ValueType OpVT256, Predicate prd> {
let Predicates = [HasAVX2, prd] in {
def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -7418,7 +7417,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
+ (OpVT128 (bcast_frag addr:$src)))]>,
Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -7428,7 +7427,7 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
+ (OpVT256 (bcast_frag addr:$src)))]>,
Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
// Provide aliases for broadcast from the same register class that
@@ -7439,13 +7438,13 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
}
}
-defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
v16i8, v32i8, NoVLX_Or_NoBWI>;
-defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
v8i16, v16i16, NoVLX_Or_NoBWI>;
-defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
v4i32, v8i32, NoVLX>;
-defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX] in {
@@ -7455,14 +7454,11 @@ let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
- def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ // FIXME this is to handle aligned extloads from i8/i16.
+ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
(VPBROADCASTDrm addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VPBROADCASTDYrm addr:$src)>;
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VPBROADCASTQYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
@@ -7483,17 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i16 (X86VBroadcast
(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
(VPBROADCASTWYrm addr:$src)>;
-}
-let Predicates = [HasAVX2, NoVLX] in {
- // Provide aliases for broadcast from the same register class that
- // automatically does the extract.
- def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
- (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
- sub_xmm)))>;
- def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
- (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
- sub_xmm)))>;
+ // FIXME this is to handle aligned extloads from i8.
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWrm addr:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+ (VPBROADCASTWYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -7509,45 +7500,41 @@ let Predicates = [HasAVX2, NoVLX] in {
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
+ (VPBROADCASTBrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit)),
- VR128)))>;
+ GR8:$src, sub_8bit))))>;
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
- (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
+ (VPBROADCASTBYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit)),
- VR128)))>;
+ GR8:$src, sub_8bit))))>;
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
+ (VPBROADCASTWrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit)),
- VR128)))>;
+ GR16:$src, sub_16bit))))>;
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
- (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
+ (VPBROADCASTWYrr (VMOVDI2PDIrr
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit)),
- VR128)))>;
+ GR16:$src, sub_16bit))))>;
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+ (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
+ (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+ (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
+ (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}
// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
(VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
(VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
(VBROADCASTSSrm addr:$src)>;
}
@@ -7557,12 +7544,12 @@ let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
- def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
(VMOVDDUPrm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
(VMOVDDUPrr VR128:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
(VMOVDDUPrm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
@@ -7581,19 +7568,19 @@ let Predicates = [HasAVX1Only] in {
(v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
+ (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
- (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
+ (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
- def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
+ def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
(VMOVDDUPrm addr:$src)>;
}
@@ -7636,7 +7623,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
Sched<[Sched]>, VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins memOp:$src1, u8imm:$src2),
@@ -7644,7 +7631,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>,
+ (i8 timm:$src2))))]>,
Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
}
}
@@ -7663,19 +7650,19 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
+ (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
- (i8 imm:$src3)))]>,
+ (i8 timm:$src3)))]>,
Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
- VR256:$src1, (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
+ VR256:$src1, (i8 timm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
//===----------------------------------------------------------------------===//
@@ -7760,7 +7747,7 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskstore_q_256>, VEX_W;
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
- ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
+ ValueType MaskVT> {
// masked store
def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
@@ -7772,23 +7759,23 @@ multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
- defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
// load/store i32/i64 not supported use ps/pd version
- defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
- defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
- defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
- defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
- defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
- defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
+ defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
+ defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
+ defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}
//===----------------------------------------------------------------------===//
@@ -7956,13 +7943,13 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1,
(MemOpFrag addr:$src2),
- imm:$src3)))], SSEPackedInt>,
+ timm:$src3)))], SSEPackedInt>,
Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
}
}
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 7050e1917494..7f41feb6c0d9 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -43,7 +43,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>;
let SchedRW = [WriteSystem] in {
def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap",
- [(int_x86_int imm:$trap)]>;
+ [(int_x86_int timm:$trap)]>;
def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB;
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index fc0da845299f..3a1212342a13 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -45,7 +45,7 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins),
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
- [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+ [(int_x86_xabort timm:$imm)]>, Requires<[HasRTM]>;
} // SchedRW
// HLE prefixes
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 66ca78556b82..229af366d940 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -143,13 +143,13 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>,
+ (vt128 (OpNode (vt128 VR128:$src1), timm:$src2)))]>,
XOP, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
+ (vt128 (OpNode (vt128 (load addr:$src1)), timm:$src2)))]>,
XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -251,7 +251,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
"\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- imm:$cc)))]>,
+ timm:$cc)))]>,
XOP_4V, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$cc),
@@ -260,14 +260,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
- imm:$cc)))]>,
+ timm:$cc)))]>,
XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : Pat<(OpNode (load addr:$src2),
- (vt128 VR128:$src1), imm:$cc),
+ (vt128 VR128:$src1), timm:$cc),
(!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
- (CommuteVPCOMCC imm:$cc))>;
+ (CommuteVPCOMCC timm:$cc))>;
}
defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>;
@@ -418,27 +418,27 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
X86FoldableSchedWrite sched> {
def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
- (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 timm:$src4))))]>,
Sched<[sched]>;
def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, intmemop:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
- (i8 imm:$src4))))]>, VEX_W,
+ (i8 timm:$src4))))]>, VEX_W,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, fpmemop:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
- RC:$src3, (i8 imm:$src4))))]>,
+ RC:$src3, (i8 timm:$src4))))]>,
Sched<[sched.Folded, sched.ReadAfterFold,
// fpmemop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -447,7 +447,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
+ (ins RC:$src1, RC:$src2, RC:$src3, u4imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[]>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>;
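Note on the imm -> timm switch above: these XOP patterns now match ISD::TargetConstant nodes rather than ISD::Constant nodes, so the intrinsic's immediate has to reach instruction selection as a target constant. Below is a minimal C++ sketch of that step, assuming the usual SelectionDAG lowering context; the helper name is illustrative and not taken from the patch.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Materialize an 8-bit immediate as a TargetConstant so that patterns
// written against 'timm' can match it; DAG.getConstant would instead yield
// ISD::Constant, which only plain 'imm' patterns accept.
static SDValue buildImm8ForTimmPattern(SelectionDAG &DAG, const SDLoc &DL,
                                       uint64_t Imm) {
  return DAG.getTargetConstant(Imm, DL, MVT::i8);
}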
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 892a083f4d1a..01620b7b64c9 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -60,7 +60,7 @@ public:
X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI,
const X86RegisterBankInfo &RBI);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName() { return DEBUG_TYPE; }
private:
@@ -94,11 +94,9 @@ private:
MachineFunction &MF) const;
bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const;
+ MachineFunction &MF);
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const;
+ MachineFunction &MF);
bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -217,7 +215,7 @@ static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
}
static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
if (X86::GR64RegClass.contains(Reg))
return &X86::GR64RegClass;
if (X86::GR32RegClass.contains(Reg))
@@ -233,15 +231,15 @@ static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
// Set X86 Opcode and constrain DestReg.
bool X86InstructionSelector::selectCopy(MachineInstr &I,
MachineRegisterInfo &MRI) const {
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- unsigned SrcReg = I.getOperand(1).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ if (Register::isPhysicalRegister(DstReg)) {
assert(I.isCopy() && "Generic operators do not allow physical registers");
if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
@@ -253,7 +251,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRC != DstRC) {
// This case can be generated by ABI lowering, perform anyext
- unsigned ExtSrc = MRI.createVirtualRegister(DstRC);
+ Register ExtSrc = MRI.createVirtualRegister(DstRC);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(ExtSrc)
@@ -268,12 +266,12 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
- assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+ assert((!Register::isPhysicalRegister(SrcReg) || I.isCopy()) &&
"No phys reg on generic operators");
assert((DstSize == SrcSize ||
// Copies are a means to set up initial types; the number of
// bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ (Register::isPhysicalRegister(SrcReg) &&
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
"Copy with different width?!");
@@ -282,7 +280,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
if (SrcRegBank.getID() == X86::GPRRegBankID &&
DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
- TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ Register::isPhysicalRegister(SrcReg)) {
// Change the physical register to perform the truncate.
const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
@@ -308,8 +306,7 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
-bool X86InstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool X86InstructionSelector::select(MachineInstr &I) {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -333,7 +330,7 @@ bool X86InstructionSelector::select(MachineInstr &I,
assert(I.getNumOperands() == I.getNumExplicitOperands() &&
"Generic instruction has unexpected implicit operands\n");
- if (selectImpl(I, CoverageInfo))
+ if (selectImpl(I, *CoverageInfo))
return true;
LLVM_DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
@@ -370,10 +367,10 @@ bool X86InstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_UADDE:
return selectUadde(I, MRI, MF);
case TargetOpcode::G_UNMERGE_VALUES:
- return selectUnmergeValues(I, MRI, MF, CoverageInfo);
+ return selectUnmergeValues(I, MRI, MF);
case TargetOpcode::G_MERGE_VALUES:
case TargetOpcode::G_CONCAT_VECTORS:
- return selectMergeValues(I, MRI, MF, CoverageInfo);
+ return selectMergeValues(I, MRI, MF);
case TargetOpcode::G_EXTRACT:
return selectExtract(I, MRI, MF);
case TargetOpcode::G_INSERT:
@@ -512,7 +509,7 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
@@ -572,7 +569,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
// Use LEA to calculate frame index and GEP
@@ -625,7 +622,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
AM.Base.Reg = X86::RIP;
}
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
unsigned NewOpc = getLeaOP(Ty, STI);
@@ -644,7 +641,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
assert((I.getOpcode() == TargetOpcode::G_CONSTANT) &&
"unexpected instruction");
- const unsigned DefReg = I.getOperand(0).getReg();
+ const Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID)
@@ -717,8 +714,8 @@ bool X86InstructionSelector::selectTruncOrPtrToInt(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_PTRTOINT) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -781,8 +778,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -892,8 +889,8 @@ bool X86InstructionSelector::selectAnyext(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
@@ -952,8 +949,8 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
(CmpInst::Predicate)I.getOperand(1).getPredicate());
- unsigned LHS = I.getOperand(2).getReg();
- unsigned RHS = I.getOperand(3).getReg();
+ Register LHS = I.getOperand(2).getReg();
+ Register RHS = I.getOperand(3).getReg();
if (SwapArgs)
std::swap(LHS, RHS);
@@ -998,8 +995,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_FCMP) && "unexpected instruction");
- unsigned LhsReg = I.getOperand(2).getReg();
- unsigned RhsReg = I.getOperand(3).getReg();
+ Register LhsReg = I.getOperand(2).getReg();
+ Register RhsReg = I.getOperand(3).getReg();
CmpInst::Predicate Predicate =
(CmpInst::Predicate)I.getOperand(1).getPredicate();
@@ -1033,7 +1030,7 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
break;
}
- unsigned ResultReg = I.getOperand(0).getReg();
+ Register ResultReg = I.getOperand(0).getReg();
RBI.constrainGenericRegister(
ResultReg,
*getRegClass(LLT::scalar(8), *RBI.getRegBank(ResultReg, MRI, TRI)), MRI);
@@ -1043,8 +1040,8 @@ bool X86InstructionSelector::selectFCmp(MachineInstr &I,
.addReg(LhsReg)
.addReg(RhsReg);
- unsigned FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
- unsigned FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register FlagReg1 = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register FlagReg2 = MRI.createVirtualRegister(&X86::GR8RegClass);
MachineInstr &Set1 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]);
MachineInstr &Set2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -1089,11 +1086,11 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned CarryOutReg = I.getOperand(1).getReg();
- const unsigned Op0Reg = I.getOperand(2).getReg();
- const unsigned Op1Reg = I.getOperand(3).getReg();
- unsigned CarryInReg = I.getOperand(4).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register CarryOutReg = I.getOperand(1).getReg();
+ const Register Op0Reg = I.getOperand(2).getReg();
+ const Register Op1Reg = I.getOperand(3).getReg();
+ Register CarryInReg = I.getOperand(4).getReg();
const LLT DstTy = MRI.getType(DstReg);
@@ -1149,8 +1146,8 @@ bool X86InstructionSelector::selectExtract(MachineInstr &I,
assert((I.getOpcode() == TargetOpcode::G_EXTRACT) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
int64_t Index = I.getOperand(2).getImm();
const LLT DstTy = MRI.getType(DstReg);
@@ -1281,9 +1278,9 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned InsertReg = I.getOperand(2).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
+ const Register InsertReg = I.getOperand(2).getReg();
int64_t Index = I.getOperand(3).getImm();
const LLT DstTy = MRI.getType(DstReg);
@@ -1335,14 +1332,13 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
}
bool X86InstructionSelector::selectUnmergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) &&
"unexpected instruction");
// Split to extracts.
unsigned NumDefs = I.getNumOperands() - 1;
- unsigned SrcReg = I.getOperand(NumDefs).getReg();
+ Register SrcReg = I.getOperand(NumDefs).getReg();
unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
for (unsigned Idx = 0; Idx < NumDefs; ++Idx) {
@@ -1352,7 +1348,7 @@ bool X86InstructionSelector::selectUnmergeValues(
.addReg(SrcReg)
.addImm(Idx * DefSize);
- if (!select(ExtrInst, CoverageInfo))
+ if (!select(ExtrInst))
return false;
}
@@ -1361,15 +1357,14 @@ bool X86InstructionSelector::selectUnmergeValues(
}
bool X86InstructionSelector::selectMergeValues(
- MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
- CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) {
assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES ||
I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS) &&
"unexpected instruction");
// Split to inserts.
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg0 = I.getOperand(1).getReg();
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg0 = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg0);
@@ -1378,13 +1373,13 @@ bool X86InstructionSelector::selectMergeValues(
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
// For the first src use insertSubReg.
- unsigned DefReg = MRI.createGenericVirtualRegister(DstTy);
+ Register DefReg = MRI.createGenericVirtualRegister(DstTy);
MRI.setRegBank(DefReg, RegBank);
if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF))
return false;
for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
- unsigned Tmp = MRI.createGenericVirtualRegister(DstTy);
+ Register Tmp = MRI.createGenericVirtualRegister(DstTy);
MRI.setRegBank(Tmp, RegBank);
MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -1395,7 +1390,7 @@ bool X86InstructionSelector::selectMergeValues(
DefReg = Tmp;
- if (!select(InsertInst, CoverageInfo))
+ if (!select(InsertInst))
return false;
}
@@ -1403,7 +1398,7 @@ bool X86InstructionSelector::selectMergeValues(
TII.get(TargetOpcode::COPY), DstReg)
.addReg(DefReg);
- if (!select(CopyInst, CoverageInfo))
+ if (!select(CopyInst))
return false;
I.eraseFromParent();
@@ -1415,7 +1410,7 @@ bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
MachineFunction &MF) const {
assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction");
- const unsigned CondReg = I.getOperand(0).getReg();
+ const Register CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
MachineInstr &TestInst =
@@ -1442,7 +1437,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
if (CM != CodeModel::Small && CM != CodeModel::Large)
return false;
- const unsigned DstReg = I.getOperand(0).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
unsigned Align = DstTy.getSizeInBits();
@@ -1460,7 +1455,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
// Under X86-64 non-small code model, GV (and friends) are 64-bits, so
// they cannot be folded into immediate fields.
- unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
+ Register AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
@@ -1503,7 +1498,7 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
I.getOpcode() == TargetOpcode::G_PHI) &&
"unexpected instruction");
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
if (!MRI.getRegClassOrNull(DstReg)) {
const LLT DstTy = MRI.getType(DstReg);
@@ -1537,7 +1532,7 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_LSHR) &&
"unexpected instruction");
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
@@ -1578,8 +1573,8 @@ bool X86InstructionSelector::selectShift(MachineInstr &I,
return false;
}
- unsigned Op0Reg = I.getOperand(1).getReg();
- unsigned Op1Reg = I.getOperand(2).getReg();
+ Register Op0Reg = I.getOperand(1).getReg();
+ Register Op1Reg = I.getOperand(2).getReg();
assert(MRI.getType(Op1Reg).getSizeInBits() == 8);
@@ -1606,9 +1601,9 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
I.getOpcode() == TargetOpcode::G_UREM) &&
"unexpected instruction");
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned Op1Reg = I.getOperand(1).getReg();
- const unsigned Op2Reg = I.getOperand(2).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register Op1Reg = I.getOperand(1).getReg();
+ const Register Op2Reg = I.getOperand(2).getReg();
const LLT RegTy = MRI.getType(DstReg);
assert(RegTy == MRI.getType(Op1Reg) && RegTy == MRI.getType(Op2Reg) &&
@@ -1732,7 +1727,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(OpEntry.OpSignExtend));
else {
- unsigned Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass);
+ Register Zero32 = MRI.createVirtualRegister(&X86::GR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::MOV32r0),
Zero32);
@@ -1770,8 +1765,8 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
if ((I.getOpcode() == Instruction::SRem ||
I.getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) {
- unsigned SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
- unsigned ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg)
.addReg(X86::AX);
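The selector changes above replace raw unsigned register numbers with the Register wrapper and move the physical/virtual queries from TargetRegisterInfo onto static helpers of Register. A small sketch of the new idiom, assuming nothing beyond the LLVM CodeGen headers; the function itself is illustrative only.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Register converts implicitly to and from unsigned, so call sites such as
// getOperand(0).getReg() can change type without touching their users.
static bool defIsVirtual(const MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  return Register::isVirtualRegister(DstReg);
}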
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 40141d894629..1d7adbaa9e99 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t {
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
INTR_TYPE_3OP_IMM8,
- CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
+ CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
CVTPD2PS_MASK,
INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
INTR_TYPE_1OP_MASK_SAE, INTR_TYPE_2OP_MASK_SAE, INTR_TYPE_3OP_MASK_SAE,
@@ -1101,8 +1101,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
- X86_INTRINSIC_DATA(tbm_bextri_u32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
- X86_INTRINSIC_DATA(tbm_bextri_u64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
+ X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
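Splitting tbm_bextri_u32/u64 into the new BEXTRI category lets their control operand be rebuilt as a target constant during lowering, in line with the imm -> timm changes elsewhere in this import. The sketch below only mirrors that idea and is not copied from X86ISelLowering; the helper name is invented, and the operand indices assume the usual INTRINSIC_WO_CHAIN layout where operand 0 is the intrinsic ID.

#include "X86ISelLowering.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Rebuild the constant control operand as a TargetConstant so the BEXTR
// patterns (written with 'timm') can match the resulting node.
static SDValue lowerBextriLike(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  uint64_t Ctrl = Op.getConstantOperandVal(2);
  SDValue Control = DAG.getTargetConstant(Ctrl, DL, Op.getValueType());
  return DAG.getNode(X86ISD::BEXTR, DL, Op.getValueType(), Op.getOperand(1),
                     Control);
}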
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 00fb1b573858..04121f863c89 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -13,6 +13,7 @@
#include "X86LegalizerInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -84,6 +85,24 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
verify(*STI.getInstrInfo());
}
+bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ LegalizerHelper::UnableToLegalize)
+ return false;
+ MI.eraseFromParent();
+ return true;
+ default:
+ break;
+ }
+ return true;
+}
+
void X86LegalizerInfo::setLegalizerInfo32bit() {
const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
@@ -158,6 +177,7 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_ANYEXT, Ty}, Legal);
}
setAction({G_ANYEXT, s128}, Legal);
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
// Comparison
setAction({G_ICMP, s1}, Legal);
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
index d21707b9ab9b..7a0f13fb5ae6 100644
--- a/lib/Target/X86/X86LegalizerInfo.h
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -32,6 +32,9 @@ private:
public:
X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+ bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
private:
void setLegalizerInfo32bit();
void setLegalizerInfo64bit();
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index b1fefaa84be4..78098fd6262f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -427,6 +427,41 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
}
}
+// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
+// information.
+static unsigned convertTailJumpOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::TAILJMPr:
+ Opcode = X86::JMP32r;
+ break;
+ case X86::TAILJMPm:
+ Opcode = X86::JMP32m;
+ break;
+ case X86::TAILJMPr64:
+ Opcode = X86::JMP64r;
+ break;
+ case X86::TAILJMPm64:
+ Opcode = X86::JMP64m;
+ break;
+ case X86::TAILJMPr64_REX:
+ Opcode = X86::JMP64r_REX;
+ break;
+ case X86::TAILJMPm64_REX:
+ Opcode = X86::JMP64m_REX;
+ break;
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ Opcode = X86::JMP_1;
+ break;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::JCC_1;
+ break;
+ }
+
+ return Opcode;
+}
+
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
@@ -500,21 +535,190 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
}
- // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
- // inputs modeled as normal uses instead of implicit uses. As such, truncate
- // off all but the first operand (the callee). FIXME: Change isel.
- case X86::TAILJMPr64:
- case X86::TAILJMPr64_REX:
- case X86::CALL64r:
- case X86::CALL64pcrel32: {
- unsigned Opcode = OutMI.getOpcode();
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
+ case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik:
+ case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik:
+ case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik:
+ case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik:
+ case X86::VPCMPBZrmi: case X86::VPCMPBZrmik:
+ case X86::VPCMPBZrri: case X86::VPCMPBZrrik:
+ case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik:
+ case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk:
+ case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik:
+ case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik:
+ case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk:
+ case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik:
+ case X86::VPCMPDZrmi: case X86::VPCMPDZrmik:
+ case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk:
+ case X86::VPCMPDZrri: case X86::VPCMPDZrrik:
+ case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik:
+ case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk:
+ case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik:
+ case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik:
+ case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk:
+ case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik:
+ case X86::VPCMPQZrmi: case X86::VPCMPQZrmik:
+ case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk:
+ case X86::VPCMPQZrri: case X86::VPCMPQZrrik:
+ case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik:
+ case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik:
+ case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik:
+ case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik:
+ case X86::VPCMPWZrmi: case X86::VPCMPWZrmik:
+ case X86::VPCMPWZrri: case X86::VPCMPWZrrik: {
+ // Turn immediate 0 into the VPCMPEQ instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
+ // Turn immediate 6 into the VPCMPGT instruction.
+ if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break;
+ case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
+ case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break;
+ case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break;
+ case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break;
+ case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break;
+ case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break;
+ case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break;
+ case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break;
+ case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break;
+ case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break;
+ case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break;
+ case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break;
+ case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break;
+ case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break;
+ case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break;
+ case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break;
+ case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break;
+ case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break;
+ case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break;
+ case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break;
+ case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break;
+ case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break;
+ case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break;
+ case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break;
+ case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break;
+ case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break;
+ case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break;
+ case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break;
+ case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break;
+ case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break;
+ case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break;
+ case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break;
+ case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break;
+ case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break;
+ case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break;
+ case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break;
+ case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break;
+ case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break;
+ case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break;
+ case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break;
+ case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break;
+ case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break;
+ case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break;
+ case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break;
+ case X86::VPCMPQZrmik: NewOpc = X86::VPCMPGTQZrmk; break;
+ case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break;
+ case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break;
+ case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break;
+ case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break;
+ case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break;
+ case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break;
+ case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break;
+ case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break;
+ case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break;
+ case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break;
+ case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break;
+ case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break;
+ case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break;
+ case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break;
+ }
+
+ OutMI.setOpcode(NewOpc);
+ OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
+ break;
+ }
+
break;
}
+ // CALL64r, CALL64pcrel32 - These instructions used to have
+ // register inputs modeled as normal uses instead of implicit uses. As such,
+ // we used to truncate off all but the first operand (the callee). This
+ // issue seems to have been fixed at some point. This assert verifies that.
+ case X86::CALL64r:
+ case X86::CALL64pcrel32:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ break;
+
case X86::EH_RETURN:
case X86::EH_RETURN64: {
OutMI = MCInst();
@@ -539,36 +743,30 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
}
- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump
- // instruction.
- {
- unsigned Opcode;
- case X86::TAILJMPr:
- Opcode = X86::JMP32r;
- goto SetTailJmpOpcode;
- case X86::TAILJMPd:
- case X86::TAILJMPd64:
- Opcode = X86::JMP_1;
- goto SetTailJmpOpcode;
-
- SetTailJmpOpcode:
- MCOperand Saved = OutMI.getOperand(0);
- OutMI = MCInst();
- OutMI.setOpcode(Opcode);
- OutMI.addOperand(Saved);
- break;
- }
+ // TAILJMPr, TAILJMPr64, TAILJMPr64_REX, TAILJMPd, TAILJMPd64 - Lower to
+ // the correct jump instruction.
+ case X86::TAILJMPr:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64:
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC: {
- MCOperand Saved = OutMI.getOperand(0);
- MCOperand Saved2 = OutMI.getOperand(1);
- OutMI = MCInst();
- OutMI.setOpcode(X86::JCC_1);
- OutMI.addOperand(Saved);
- OutMI.addOperand(Saved2);
+ case X86::TAILJMPd64_CC:
+ assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
+ break;
+
+ case X86::TAILJMPm:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPm64_REX:
+ assert(OutMI.getNumOperands() == X86::AddrNumOperands &&
+ "Unexpected number of operands!");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
- }
case X86::DEC16r:
case X86::DEC32r:
@@ -958,7 +1156,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
// FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
// <opcode>, <operands>
- unsigned DefRegister = FaultingMI.getOperand(0).getReg();
+ Register DefRegister = FaultingMI.getOperand(0).getReg();
FaultMaps::FaultKind FK =
static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
@@ -1079,7 +1277,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
// Emit MOV to materialize the target address and the CALL to target.
// This is encoded with 12-13 bytes, depending on which register is used.
- unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
+ Register ScratchReg = MI.getOperand(ScratchIdx).getReg();
if (X86II::isX86_64ExtendedReg(ScratchReg))
EncodedBytes = 13;
else
@@ -1369,6 +1567,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
recordSled(CurSled, MI, SledKind::TAIL_CALL);
unsigned OpCode = MI.getOperand(0).getImm();
+ OpCode = convertTailJumpOpcode(OpCode);
MCInst TC;
TC.setOpcode(OpCode);
@@ -1538,8 +1737,6 @@ static void printConstant(const Constant *COp, raw_ostream &CS) {
void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
if (EmitFPOData) {
@@ -1577,17 +1774,16 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
// Otherwise, use the .seh_ directives for all other Windows platforms.
switch (MI->getOpcode()) {
case X86::SEH_PushReg:
- OutStreamer->EmitWinCFIPushReg(
- RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ OutStreamer->EmitWinCFIPushReg(MI->getOperand(0).getImm());
break;
case X86::SEH_SaveReg:
- OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ OutStreamer->EmitWinCFISaveReg(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
case X86::SEH_SaveXMM:
- OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ OutStreamer->EmitWinCFISaveXMM(MI->getOperand(0).getImm(),
MI->getOperand(1).getImm());
break;
@@ -1596,9 +1792,8 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
break;
case X86::SEH_SetFrame:
- OutStreamer->EmitWinCFISetFrame(
- RI->getSEHRegNum(MI->getOperand(0).getImm()),
- MI->getOperand(1).getImm());
+ OutStreamer->EmitWinCFISetFrame(MI->getOperand(0).getImm(),
+ MI->getOperand(1).getImm());
break;
case X86::SEH_PushFrame:
@@ -1650,7 +1845,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
OutStreamer->AddComment(StringRef("eh_return, addr: %") +
X86ATTInstPrinter::getRegisterName(Reg));
break;
@@ -1697,11 +1892,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MASKPAIR16LOAD: {
int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
- unsigned Reg = MI->getOperand(0).getReg();
- unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+ Register Reg = MI->getOperand(0).getReg();
+ Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Load the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
@@ -1730,11 +1923,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::MASKPAIR16STORE: {
int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
- unsigned Reg = MI->getOperand(X86::AddrNumOperands).getReg();
- unsigned Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- unsigned Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
+ Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
+ Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
// Store the first mask register
MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
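The VPCMP block earlier in this file maps a predicate immediate of 0 to the VPCMPEQ* opcodes and 6 to the VPCMPGT* opcodes, then drops the immediate. Here is a compressed restatement of that idea; the real lowering enumerates every opcode pair explicitly, while this helper is illustrative and assumes suitable EQ/GT opcodes are passed in.

#include "llvm/MC/MCInst.h"
using namespace llvm;

// If the trailing predicate immediate encodes EQ (0) or GT (6), switch to
// the dedicated opcode and erase the now-implicit immediate operand.
static void compressVPCMP(MCInst &OutMI, unsigned EqOpc, unsigned GtOpc) {
  int64_t Pred = OutMI.getOperand(OutMI.getNumOperands() - 1).getImm();
  if (Pred != 0 && Pred != 6)
    return;
  OutMI.setOpcode(Pred == 0 ? EqOpc : GtOpc);
  OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1));
}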
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d81..5cb80a082b56 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// is stashed.
signed char RestoreBasePointerOffset = 0;
+ /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+ /// in bytes.
+ DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
/// CalleeSavedFrameSize - Size of the callee-saved register portion of the
/// stack frame in bytes.
unsigned CalleeSavedFrameSize = 0;
@@ -120,6 +124,10 @@ public:
void setRestoreBasePointer(const MachineFunction *MF);
int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
+ DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+ const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+ return WinEHXMMSlotInfo; }
+
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index 7f75598b0655..1aee01563c4b 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -198,8 +198,7 @@ static inline MemOpKey getMemOpKey(const MachineInstr &MI, unsigned N) {
static inline bool isIdenticalOp(const MachineOperand &MO1,
const MachineOperand &MO2) {
return MO1.isIdenticalTo(MO2) &&
- (!MO1.isReg() ||
- !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+ (!MO1.isReg() || !Register::isPhysicalRegister(MO1.getReg()));
}
#ifndef NDEBUG
@@ -235,9 +234,9 @@ static inline bool isLEA(const MachineInstr &MI) {
namespace {
-class OptimizeLEAPass : public MachineFunctionPass {
+class X86OptimizeLEAPass : public MachineFunctionPass {
public:
- OptimizeLEAPass() : MachineFunctionPass(ID) {}
+ X86OptimizeLEAPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 LEA Optimize"; }
@@ -246,6 +245,8 @@ public:
/// been calculated by LEA. Also, remove redundant LEAs.
bool runOnMachineFunction(MachineFunction &MF) override;
+ static char ID;
+
private:
using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -296,18 +297,18 @@ private:
MachineRegisterInfo *MRI;
const X86InstrInfo *TII;
const X86RegisterInfo *TRI;
-
- static char ID;
};
} // end anonymous namespace
-char OptimizeLEAPass::ID = 0;
+char X86OptimizeLEAPass::ID = 0;
-FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
+FunctionPass *llvm::createX86OptimizeLEAs() { return new X86OptimizeLEAPass(); }
+INITIALIZE_PASS(X86OptimizeLEAPass, DEBUG_TYPE, "X86 optimize LEA pass", false,
+ false)
-int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
- const MachineInstr &Last) {
+int X86OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
+ const MachineInstr &Last) {
// Both instructions must be in the same basic block and they must be
// presented in InstrPos.
assert(Last.getParent() == First.getParent() &&
@@ -328,10 +329,9 @@ int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
// 3) Displacement of the new memory operand should fit in 1 byte if possible.
// 4) The LEA should be as close to MI as possible, and prior to it if
// possible.
-bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
- const MachineInstr &MI,
- MachineInstr *&BestLEA,
- int64_t &AddrDispShift, int &Dist) {
+bool X86OptimizeLEAPass::chooseBestLEA(
+ const SmallVectorImpl<MachineInstr *> &List, const MachineInstr &MI,
+ MachineInstr *&BestLEA, int64_t &AddrDispShift, int &Dist) {
const MachineFunction *MF = MI.getParent()->getParent();
const MCInstrDesc &Desc = MI.getDesc();
int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags) +
@@ -387,9 +387,10 @@ bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
// Get the difference between the addresses' displacements of the two
// instructions \p MI1 and \p MI2. The numbers of the first memory operands are
// passed through \p N1 and \p N2.
-int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
- const MachineInstr &MI2,
- unsigned N2) const {
+int64_t X86OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1,
+ unsigned N1,
+ const MachineInstr &MI2,
+ unsigned N2) const {
const MachineOperand &Op1 = MI1.getOperand(N1 + X86::AddrDisp);
const MachineOperand &Op2 = MI2.getOperand(N2 + X86::AddrDisp);
@@ -411,9 +412,9 @@ int64_t OptimizeLEAPass::getAddrDispShift(const MachineInstr &MI1, unsigned N1,
// 2) Def registers of LEAs belong to the same class.
// 3) All uses of the Last LEA def register are replaceable, thus the
// register is used only as address base.
-bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
- const MachineInstr &Last,
- int64_t &AddrDispShift) const {
+bool X86OptimizeLEAPass::isReplaceable(const MachineInstr &First,
+ const MachineInstr &Last,
+ int64_t &AddrDispShift) const {
assert(isLEA(First) && isLEA(Last) &&
"The function works only with LEA instructions");
@@ -467,7 +468,8 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
return true;
}
-void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
+void X86OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
+ MemOpMap &LEAs) {
unsigned Pos = 0;
for (auto &MI : MBB) {
// Assign the position number to the instruction. Note that we are going to
@@ -485,7 +487,7 @@ void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
// Try to find load and store instructions which recalculate addresses already
// calculated by some LEA and replace their memory operands with its def
// register.
-bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
+bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
bool Changed = false;
assert(!LEAs.empty());
@@ -564,9 +566,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
return Changed;
}
-MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
- unsigned VReg,
- int64_t AddrDispShift) {
+MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
+ unsigned VReg,
+ int64_t AddrDispShift) {
DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
if (AddrDispShift != 0)
Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
@@ -583,7 +585,7 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
}
// Try to find similar LEAs in the list and replace one with another.
-bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
+bool X86OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
bool Changed = false;
// Loop over all entries in the table.
@@ -613,8 +615,8 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// Loop over all uses of the Last LEA and update their operands. Note
// that the correctness of this has already been checked in the
// isReplaceable function.
- unsigned FirstVReg = First.getOperand(0).getReg();
- unsigned LastVReg = Last.getOperand(0).getReg();
+ Register FirstVReg = First.getOperand(0).getReg();
+ Register LastVReg = Last.getOperand(0).getReg();
for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end();
UI != UE;) {
MachineOperand &MO = *UI++;
@@ -670,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
return Changed;
}
-bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
+bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
if (DisableX86LEAOpt || skipFunction(MF.getFunction()))
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index 78fede3dcde2..daddf4231897 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -46,7 +46,9 @@ const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
if (X86::GR8RegClass.hasSubClassEq(&RC) ||
X86::GR16RegClass.hasSubClassEq(&RC) ||
X86::GR32RegClass.hasSubClassEq(&RC) ||
- X86::GR64RegClass.hasSubClassEq(&RC))
+ X86::GR64RegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESSRegClass.hasSubClassEq(&RC) ||
+ X86::LOW32_ADDR_ACCESS_RBPRegClass.hasSubClassEq(&RC))
return getRegBank(X86::GPRRegBankID);
if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 2e2f1f9e438a..ff625325b4c9 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -544,7 +544,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"Stack realignment in presence of dynamic allocas is not supported with"
"this calling convention.");
- unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
+ Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
I.isValid(); ++I)
Reserved.set(*I);
@@ -677,13 +677,13 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
MI.getOperand(4).getImm() != 0 ||
MI.getOperand(5).getReg() != X86::NoRegister)
return false;
- unsigned BasePtr = MI.getOperand(1).getReg();
+ Register BasePtr = MI.getOperand(1).getReg();
// In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will
// be replaced with a 32-bit operand MOV which will zero extend the upper
// 32-bits of the super register.
if (Opc == X86::LEA64_32r)
BasePtr = getX86SubSuperRegister(BasePtr, 32);
- unsigned NewDestReg = MI.getOperand(0).getReg();
+ Register NewDestReg = MI.getOperand(0).getReg();
const X86InstrInfo *TII =
MI.getParent()->getParent()->getSubtarget<X86Subtarget>().getInstrInfo();
TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr,
@@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
return true;
}
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::CATCHRET:
+ case X86::CLEANUPRET:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("impossible");
+}
+
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineInstr &MI = *II;
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ bool IsEHFuncletEpilogue = MBBI == MBB.end() ? false
+ : isFuncletReturnInstr(*MBBI);
const X86FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
@@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
"Return instruction can only reference SP relative frame objects");
FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+ } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
+ FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
} else {
FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
}
@@ -729,7 +746,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// register as source operand, semantic is the same and destination is
// 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
// Don't change BasePtr since it is used later for stack adjustment.
- unsigned MachineBasePtr = BasePtr;
+ Register MachineBasePtr = BasePtr;
if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
MachineBasePtr = getX86SubSuperRegister(BasePtr, 64);
@@ -773,7 +790,7 @@ Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
unsigned
X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
- unsigned FrameReg = getFrameRegister(MF);
+ Register FrameReg = getFrameRegister(MF);
if (Subtarget.isTarget64BitILP32())
FrameReg = getX86SubSuperRegister(FrameReg, 32);
return FrameReg;
@@ -782,7 +799,7 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
unsigned
X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
- unsigned StackReg = getStackRegister();
+ Register StackReg = getStackRegister();
if (Subtarget.isTarget64BitILP32())
StackReg = getX86SubSuperRegister(StackReg, 32);
return StackReg;
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index b435b22e8ac7..f8464c7e8298 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -58,8 +58,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfo>();
- AU.addPreserved<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
}
private:
@@ -97,7 +97,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
TII = STI->getInstrInfo();
Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
- MMI = &getAnalysis<MachineModuleInfo>();
+ MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
Module &M = const_cast<Module &>(*MMI->getModule());
// If this function is not a thunk, check to see if we need to insert
@@ -279,7 +279,7 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
CallTarget->addLiveIn(Reg);
CallTarget->setHasAddressTaken();
- CallTarget->setAlignment(4);
+ CallTarget->setAlignment(Align(16));
insertRegReturnAddrClobber(*CallTarget, Reg);
CallTarget->back().setPreInstrSymbol(MF, TargetSym);
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
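Two API migrations show up in this hunk: MachineModuleInfo is now obtained through MachineModuleInfoWrapperPass, and block alignment is expressed as an Align value in bytes rather than a log2 integer, which is why setAlignment(4) becomes setAlignment(Align(16)). A minimal sketch of the new pattern inside an arbitrary machine-function pass follows; the pass itself is invented for illustration.

#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

namespace {
struct ExamplePass : MachineFunctionPass {
  static char ID;
  ExamplePass() : MachineFunctionPass(ID) {}
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineModuleInfoWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
  bool runOnMachineFunction(MachineFunction &MF) override {
    // The wrapper pass owns the MachineModuleInfo object now.
    MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    (void)MMI;
    // Align takes a byte count (a power of two), not a log2 exponent.
    MF.front().setAlignment(Align(16));
    return true;
  }
};
char ExamplePass::ID = 0;
} // end anonymous namespace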
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index 7574e4b8f896..9b1fcaa8a13d 100755
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -232,8 +232,12 @@ defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+
defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 284d1567c5c6..06f417501b21 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -231,8 +231,12 @@ defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+
defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td
index 41bd776648f7..76001d382a27 100644
--- a/lib/Target/X86/X86SchedPredicates.td
+++ b/lib/Target/X86/X86SchedPredicates.td
@@ -84,3 +84,60 @@ def IsSETAm_Or_SETBEm : CheckAny<[
CheckImmOperand_s<5, "X86::COND_A">,
CheckImmOperand_s<5, "X86::COND_BE">
]>;
+
+// A predicate used to check if an instruction has a LOCK prefix.
+def CheckLockPrefix : CheckFunctionPredicate<
+ "X86_MC::hasLockPrefix",
+ "X86InstrInfo::hasLockPrefix"
+>;
+
+def IsRegRegCompareAndSwap_8 : CheckOpcode<[ CMPXCHG8rr ]>;
+
+def IsRegMemCompareAndSwap_8 : CheckOpcode<[
+ LCMPXCHG8, CMPXCHG8rm
+]>;
+
+def IsRegRegCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr
+]>;
+
+def IsRegMemCompareAndSwap_16_32_64 : CheckOpcode<[
+ CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm,
+ LCMPXCHG16, LCMPXCHG32, LCMPXCHG64,
+ LCMPXCHG8B, LCMPXCHG16B
+]>;
+
+def IsCompareAndSwap8B : CheckOpcode<[ CMPXCHG8B, LCMPXCHG8B ]>;
+def IsCompareAndSwap16B : CheckOpcode<[ CMPXCHG16B, LCMPXCHG16B ]>;
+
+def IsRegMemCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegMemCompareAndSwap_8.ValidOpcodes,
+ IsRegMemCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsRegRegCompareAndSwap : CheckOpcode<
+ !listconcat(
+ IsRegRegCompareAndSwap_8.ValidOpcodes,
+ IsRegRegCompareAndSwap_16_32_64.ValidOpcodes
+ )>;
+
+def IsAtomicCompareAndSwap_8 : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap_8
+]>;
+
+def IsAtomicCompareAndSwap : CheckAll<[
+ CheckLockPrefix,
+ IsRegMemCompareAndSwap
+]>;
+
+def IsAtomicCompareAndSwap8B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap8B
+]>;
+
+def IsAtomicCompareAndSwap16B : CheckAll<[
+ CheckLockPrefix,
+ IsCompareAndSwap16B
+]>;
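Note that the predicates above only classify opcodes; the actual LOCK check is delegated to the pair of helper functions named in CheckFunctionPredicate (an MC-layer one and a MachineInstr-layer one) whose bodies are not part of this hunk. Below is a minimal sketch of what they plausibly look like, assuming they simply test the backend's existing lock-prefix markers (the X86::IP_HAS_LOCK MCInst flag and the X86II::LOCK TSFlags bit); the names are deliberately different from the real helpers and the includes assume the in-tree X86 source layout.

#include "MCTargetDesc/X86BaseInfo.h"  // assumed in-tree path: X86::IP_HAS_LOCK, X86II::LOCK
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/MC/MCInst.h"

// Sketch of the MC-layer predicate: parsed/disassembled prefixes survive as
// MCInst flags, so a flag test is all that should be needed here.
static bool hasLockPrefixMCSketch(const llvm::MCInst &MI) {
  return MI.getFlags() & llvm::X86::IP_HAS_LOCK;
}

// Sketch of the MachineInstr-layer predicate: LOCK-carrying pseudo opcodes set
// the LOCK bit in their instruction descriptor's TSFlags.
static bool hasLockPrefixMISketch(const llvm::MachineInstr &MI) {
  return MI.getDesc().TSFlags & llvm::X86II::LOCK;
}

Binding both forms to one CheckFunctionPredicate lets the same TableGen predicate resolve for MCInst-based tools such as llvm-mca and for MachineInstr-based scheduling, which is what the CMPXCHG scheduling variants further down rely on.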
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index d40bdf728a48..26d4d8fa3549 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -208,8 +208,12 @@ defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+
defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 8f3e4ae62d53..9a511ecc0071 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+
defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index 58caf1dacfcb..a8c65435ab9b 100755
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+
defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 55ca85ec1e3d..95f710061aeb 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -102,6 +102,12 @@ class X86SchedWriteMoveLS<SchedWrite MoveRR,
SchedWrite MR = StoreMR;
}
+// Multiclass that wraps masked load/store writes for a vector width.
+class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> {
+ SchedWrite RM = LoadRM;
+ SchedWrite MR = StoreMR;
+}
+
// Multiclass that wraps X86SchedWriteMoveLS for each vector width.
class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
X86SchedWriteMoveLS s128,
@@ -218,8 +224,12 @@ def WriteFStoreY : SchedWrite;
def WriteFStoreNT : SchedWrite;
def WriteFStoreNTX : SchedWrite;
def WriteFStoreNTY : SchedWrite;
-def WriteFMaskedStore : SchedWrite;
-def WriteFMaskedStoreY : SchedWrite;
+
+def WriteFMaskedStore32 : SchedWrite;
+def WriteFMaskedStore64 : SchedWrite;
+def WriteFMaskedStore32Y : SchedWrite;
+def WriteFMaskedStore64Y : SchedWrite;
+
def WriteFMove : SchedWrite;
def WriteFMoveX : SchedWrite;
def WriteFMoveY : SchedWrite;
@@ -530,6 +540,16 @@ def SchedWriteVecMoveLSNT
: X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
+// Conditional SIMD Packed Loads and Stores wrappers.
+def WriteFMaskMove32
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>;
+def WriteFMaskMove64
+ : X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>;
+def WriteFMaskMove32Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
+def WriteFMaskMove64Y
+ : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
+
// Vector width wrappers.
def SchedWriteFAdd
: X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index b0334655de7e..78acb1065ec8 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -216,8 +216,10 @@ defm : X86WriteResUnsupported<WriteFStoreY>;
def : WriteRes<WriteFStoreNT, [AtomPort0]>;
def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteFStoreNTY>;
-defm : X86WriteResUnsupported<WriteFMaskedStore>;
-defm : X86WriteResUnsupported<WriteFMaskedStoreY>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32>;
+defm : X86WriteResUnsupported<WriteFMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64>;
+defm : X86WriteResUnsupported<WriteFMaskedStore64Y>;
def : WriteRes<WriteFMove, [AtomPort01]>;
def : WriteRes<WriteFMoveX, [AtomPort01]>;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index 8cc01c3acece..d7aea3cf4e9d 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -726,8 +726,10 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
-defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
+defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
+defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 2d26232b4132..d0421d94ee05 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -180,9 +180,11 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
// Instructions that have local forwarding disabled have an extra +1cy latency.
-// A folded store needs a cycle on the SAGU for the store data,
-// most RMW instructions don't need an extra uop.
-defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
+// A folded store needs a cycle on the SAGU for the store data, most RMW
+// instructions don't need an extra uop. ALU RMW operations don't seem to
+// benefit from STLF, and their observed latency is 6cy. That is the reason why
+// this write adds two extra cycles (instead of just 1cy for the store).
+defm : X86WriteRes<WriteRMW, [JSAGU], 2, [1], 0>;
////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
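As a sanity check on the WriteRMW bump above (a hedged reading: the 3cy load-to-use figure is the same one this file quotes further down for folded operands), the folded ALU RMW latency decomposes as

\[ \mathrm{lat}(\text{ALU RMW}) \approx 3\ (\text{load-to-use}) + 1\ (\text{ALU}) + 2\ (\texttt{WriteRMW}) = 6\ \text{cy}, \]

which matches the observed 6cy cited in the comment; the previous WriteRMW value of 1 would have modelled only 5cy.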
@@ -191,22 +193,22 @@ defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>;
-defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHG,[JALU01], 1, [1], 1>;
-defm : X86WriteRes<WriteCMPXCHGRMW,[JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>;
-defm : X86WriteRes<WriteXCHG, [JALU01], 1, [1], 1>;
-
-defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>;
-defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
-defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>;
-defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>;
+defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [JALU01], 3, [3], 5>;
+defm : X86WriteRes<WriteCMPXCHGRMW, [JALU01, JSAGU, JLAGU], 11, [3, 1, 1], 6>;
+defm : X86WriteRes<WriteXCHG, [JALU01], 1, [2], 2>;
+
+defm : JWriteResIntPair<WriteIMul8, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul16, [JALU1, JMul], 3, [1, 3], 3>;
+defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 4, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32, [JALU1, JMul], 3, [1, 2], 2>;
+defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 1>;
+defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>;
+defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 1>;
+defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 1>;
defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>;
defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>;
@@ -305,6 +307,192 @@ def : WriteRes<WriteFence, [JSAGU]>;
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }
+def JWriteCMPXCHG8rr : SchedWriteRes<[JALU01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+def JWriteLOCK_CMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [3,16,16];
+ let NumMicroOps = 5;
+}
+
+def JWriteLOCK_CMPXCHGrm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 17;
+ let ResourceCycles = [3,17,17];
+ let NumMicroOps = 6;
+}
+
+def JWriteCMPXCHG8rm : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 5;
+}
+
+def JWriteCMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [3,1,1];
+ let NumMicroOps = 18;
+}
+
+def JWriteCMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 32;
+ let ResourceCycles = [6,1,1];
+ let NumMicroOps = 28;
+}
+
+def JWriteLOCK_CMPXCHG8B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [3,19,19];
+ let NumMicroOps = 18;
+}
+
+def JWriteLOCK_CMPXCHG16B : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 38;
+ let ResourceCycles = [6,38,38];
+ let NumMicroOps = 28;
+}
+
+def JWriteCMPXCHGVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap8B>, [JWriteLOCK_CMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap16B>, [JWriteLOCK_CMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap_8>, [JWriteLOCK_CMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsAtomicCompareAndSwap>, [JWriteLOCK_CMPXCHGrm]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap8B>, [JWriteCMPXCHG8B]>,
+ SchedVar<MCSchedPredicate<IsCompareAndSwap16B>, [JWriteCMPXCHG16B]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap_8>, [JWriteCMPXCHG8rm]>,
+ SchedVar<MCSchedPredicate<IsRegMemCompareAndSwap>, [WriteCMPXCHGRMW]>,
+ SchedVar<MCSchedPredicate<IsRegRegCompareAndSwap_8>, [JWriteCMPXCHG8rr]>,
+ SchedVar<NoSchedPred, [WriteCMPXCHG]>
+]>;
+
+// The first five reads are contributed by the memory load operand.
+// We ignore those reads and set a read-advance for the other input operands
+// including the implicit read of RAX.
+def : InstRW<[JWriteCMPXCHGVariant,
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8, LCMPXCHG16,
+ LCMPXCHG32, LCMPXCHG64,
+ CMPXCHG8rm, CMPXCHG16rm,
+ CMPXCHG32rm, CMPXCHG64rm)>;
+
+def : InstRW<[JWriteCMPXCHGVariant], (instrs CMPXCHG8rr, CMPXCHG16rr,
+ CMPXCHG32rr, CMPXCHG64rr)>;
+
+def : InstRW<[JWriteCMPXCHGVariant,
+ // Ignore reads contributed by the memory operand.
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+ // Add a read-advance to every implicit register read.
+ ReadAfterLd, ReadAfterLd, ReadAfterLd, ReadAfterLd], (instrs LCMPXCHG8B, LCMPXCHG16B,
+ CMPXCHG8B, CMPXCHG16B)>;
+
+def JWriteLOCK_ALURMW : SchedWriteRes<[JALU01, JLAGU, JSAGU]> {
+ let Latency = 19;
+ let ResourceCycles = [1,19,19];
+ let NumMicroOps = 1;
+}
+
+def JWriteLOCK_ALURMWVariant : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_ALURMW]>,
+ SchedVar<NoSchedPred, [WriteALURMW]>
+]>;
+def : InstRW<[JWriteLOCK_ALURMWVariant], (instrs INC8m, INC16m, INC32m, INC64m,
+ DEC8m, DEC16m, DEC32m, DEC64m,
+ NOT8m, NOT16m, NOT32m, NOT64m,
+ NEG8m, NEG16m, NEG32m, NEG64m)>;
+
+def JWriteXCHG8rr_XADDrr : SchedWriteRes<[JALU01]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[JWriteXCHG8rr_XADDrr], (instrs XCHG8rr, XADD8rr, XADD16rr,
+ XADD32rr, XADD64rr)>;
+
+// This write defines the latency of the in/out register operand of a non-atomic
+// XADDrm. This is the first of a pair of writes that model non-atomic
+// XADDrm instructions (the second write definition is JWriteXADDrm_LdSt_Part).
+//
+// We need two writes because the instruction latency differs from the output
+// register operand latency. In particular, the first write describes the first
+// (and only) output register operand of the instruction. However, the
+// instruction latency is set to the MAX of all the write latencies. That's why
+// a second write is needed in this case (see example below).
+//
+// Example:
+// XADD %ecx, (%rsp) ## Instruction latency: 11cy
+// ## ECX write Latency: 3cy
+//
+// Register ECX becomes available in 3 cycles. That is because the value of ECX
+// is exchanged with the value read from the stack pointer, and the load-to-use
+// latency is assumed to be 3cy.
+def JWriteXADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 3; // load-to-use latency
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XADDrm. This is the first of a sequence of two writes used to model atomic
+// XADD instructions. The second write of the sequence is JWriteXCHGrm_LdSt_Part.
+//
+//
+// Example:
+// LOCK XADD %ecx, (%rsp) ## Instruction Latency: 16cy
+// ## ECX write Latency: 11cy
+//
+// The value of ECX becomes available only after 11cy from the start of
+// execution. This write is used to specifically set that operand latency.
+def JWriteLOCK_XADDrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+ let NumMicroOps = 3;
+}
+
+// This write defines the latency of the in/out register operand of an atomic
+// XCHGrm. This write is the first of a sequence of two writes that describe
+// atomic XCHG operations. We need two writes because the instruction latency
+// differs from the output register write latency. We want to make sure that
+// the output register operand becomes visible after 11cy. However, we want to
+// set the instruction latency to 16cy.
+def JWriteXCHGrm_XCHG_Part : SchedWriteRes<[JALU01]> {
+ let Latency = 11;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def JWriteXADDrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+
+def JWriteXCHGrm_LdSt_Part : SchedWriteRes<[JLAGU, JSAGU]> {
+ let Latency = 16;
+ let ResourceCycles = [16, 16];
+ let NumMicroOps = 1;
+}
+
+def JWriteXADDrm_Part1 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteLOCK_XADDrm_XCHG_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_XCHG_Part]>
+]>;
+
+def JWriteXADDrm_Part2 : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckLockPrefix>, [JWriteXCHGrm_LdSt_Part]>,
+ SchedVar<NoSchedPred, [JWriteXADDrm_LdSt_Part]>
+]>;
+
+def : InstRW<[JWriteXADDrm_Part1, JWriteXADDrm_Part2, ReadAfterLd],
+ (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm,
+ LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def : InstRW<[JWriteXCHGrm_XCHG_Part, JWriteXCHGrm_LdSt_Part, ReadAfterLd],
+ (instrs XCHG8rm, XCHG16rm, XCHG32rm, XCHG64rm)>;
+
+
////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////
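To make the two-write XADD/XCHG modelling above concrete (hedged reading of the scheduling machinery: an instruction's latency is taken as the maximum over its write latencies, while each def keeps the latency of its own write; the five ReadDefault entries skip the five machine operands that make up an x86 memory reference, i.e. base, scale, index, displacement, segment):

\[ \texttt{XADD\ \%ecx,(\%rsp)}: \quad \mathrm{lat}(\texttt{ECX}) = 3\ \text{cy}, \qquad \mathrm{lat}(\text{instr}) = \max(3, 11) = 11\ \text{cy} \]
\[ \texttt{LOCK\ XADD\ \%ecx,(\%rsp)}: \quad \mathrm{lat}(\texttt{ECX}) = 11\ \text{cy}, \qquad \mathrm{lat}(\text{instr}) = \max(11, 16) = 16\ \text{cy} \]

These are exactly the figures quoted in the comments above; the split into an XCHG-part write and a load/store-part write exists purely so that the two numbers can differ.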
@@ -313,19 +501,22 @@ defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;
defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
-defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
+defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
@@ -466,8 +657,8 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
////////////////////////////////////////////////////////////////////////////////
defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [JLAGU], 5, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [JLAGU], 5, [2], 2>;
defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
@@ -475,7 +666,7 @@ defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 4, 4],
defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
-defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
@@ -631,6 +822,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> {
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
///////////////////////////////////////////////////////////////////////////////
+// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQU

+///////////////////////////////////////////////////////////////////////////////
+
+def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
+ let Latency = 34;
+ let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
+ let NumMicroOps = 63;
+}
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
+ VMASKMOVDQU, VMASKMOVDQU64)>;
+
+///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 34c251a5c5bb..8e3ce721f1a1 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -186,8 +186,12 @@ def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
-def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>;
-def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>;
+
+def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>;
+
def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index 65f6d89df610..06201f4a3a84 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -268,8 +268,12 @@ defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
-defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+
+defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+
defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 50690953eef5..1ae8df977f83 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -36,7 +36,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
DAG.getSubtarget().getRegisterInfo());
- unsigned BaseReg = TRI->getBaseRegister();
+ Register BaseReg = TRI->getBaseRegister();
for (unsigned R : ClobberSet)
if (BaseReg == R)
return true;
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 40f5dbe57e4b..b8980789258e 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -477,7 +477,7 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
// Otherwise, just build the predicate state itself by zeroing a register
// as we don't need any initial state.
PS->InitialReg = MRI->createVirtualRegister(PS->RC);
- unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
PredStateSubReg);
++NumInstsInserted;
@@ -750,7 +750,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
// Note that we intentionally use an empty debug location so that
// this picks up the preceding location.
auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
@@ -907,7 +907,7 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
MI.dump(); dbgs() << "\n");
report_fatal_error("Unable to unfold load!");
}
- unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
+ Register Reg = MRI->createVirtualRegister(UnfoldedRC);
SmallVector<MachineInstr *, 2> NewMIs;
// If we were able to compute an unfolded reg class, any failure here
// is just a programming error so just assert.
@@ -1102,7 +1102,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// synthetic target in the predecessor. We do this at the bottom of the
// predecessor.
auto InsertPt = Pred->getFirstTerminator();
- unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
if (MF.getTarget().getCodeModel() == CodeModel::Small &&
!Subtarget->isPositionIndependent()) {
// Directly materialize it into an immediate.
@@ -1153,7 +1153,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
} else {
// Otherwise compute the address into a register first.
- unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
auto AddrI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
.addReg(/*Base*/ X86::RIP)
@@ -1175,7 +1175,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// Now cmov over the predicate if the comparison wasn't equal.
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI =
BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
.addReg(PS->InitialReg)
@@ -1878,7 +1878,7 @@ unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
DebugLoc Loc) {
// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
// what instruction selection does.
- unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
// We directly copy the FLAGS register and rely on later lowering to clean
// this up into the appropriate setCC instructions.
BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
@@ -1905,7 +1905,7 @@ void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
unsigned PredStateReg) {
- unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
// FIXME: This hard codes a shift distance based on the number of bits needed
// to stay canonical on 64-bit. We should compute this somehow and support
// 32-bit as part of that.
@@ -1925,8 +1925,8 @@ void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
- unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
- unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
+ Register PredStateReg = MRI->createVirtualRegister(PS->RC);
+ Register TmpReg = MRI->createVirtualRegister(PS->RC);
// We know that the stack pointer will have any preserved predicate state in
// its high bit. We just want to smear this across the other bits. Turns out,
@@ -2031,9 +2031,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
}
for (MachineOperand *Op : HardenOpRegs) {
- unsigned OpReg = Op->getReg();
+ Register OpReg = Op->getReg();
auto *OpRC = MRI->getRegClass(OpReg);
- unsigned TmpReg = MRI->createVirtualRegister(OpRC);
+ Register TmpReg = MRI->createVirtualRegister(OpRC);
// If this is a vector register, we'll need somewhat custom logic to handle
// hardening it.
@@ -2045,7 +2045,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
// Move our state into a vector register.
// FIXME: We could skip this at the cost of longer encodings with AVX-512
// but that doesn't seem likely worth it.
- unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
+ Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
auto MovI =
BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
.addReg(StateReg);
@@ -2054,7 +2054,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
// Broadcast it across the vector register.
- unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
+ Register VBStateReg = MRI->createVirtualRegister(OpRC);
auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
TII->get(Is128Bit ? X86::VPBROADCASTQrr
: X86::VPBROADCASTQYrr),
@@ -2084,7 +2084,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
// Broadcast our state into a vector register.
- unsigned VStateReg = MRI->createVirtualRegister(OpRC);
+ Register VStateReg = MRI->createVirtualRegister(OpRC);
unsigned BroadcastOp =
Is128Bit ? X86::VPBROADCASTQrZ128r
: Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
@@ -2153,7 +2153,7 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// See if we can sink hardening the loaded value.
auto SinkCheckToSingleUse =
[&](MachineInstr &MI) -> Optional<MachineInstr *> {
- unsigned DefReg = MI.getOperand(0).getReg();
+ Register DefReg = MI.getOperand(0).getReg();
// We need to find a single use which we can sink the check. We can
// primarily do this because many uses may already end up checked on their
@@ -2210,8 +2210,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// If this register isn't a virtual register we can't walk uses of sanely,
// just bail. Also check that its register class is one of the ones we
// can harden.
- unsigned UseDefReg = UseMI.getOperand(0).getReg();
- if (!TRI->isVirtualRegister(UseDefReg) ||
+ Register UseDefReg = UseMI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(UseDefReg) ||
!canHardenRegister(UseDefReg))
return {};
@@ -2241,6 +2241,9 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
// We don't support post-load hardening of vectors.
return false;
+ unsigned RegIdx = Log2_32(RegBytes);
+ assert(RegIdx < 4 && "Unsupported register size");
+
// If this register class is explicitly constrained to a class that doesn't
// require REX prefix, we may not be able to satisfy that constraint when
// emitting the hardening instructions, so bail out here.
@@ -2251,13 +2254,13 @@ bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
const TargetRegisterClass *NOREXRegClasses[] = {
&X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
&X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
- if (RC == NOREXRegClasses[Log2_32(RegBytes)])
+ if (RC == NOREXRegClasses[RegIdx])
return false;
const TargetRegisterClass *GPRRegClasses[] = {
&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
&X86::GR64RegClass};
- return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
+ return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
}
/// Harden a value in a register.
@@ -2278,7 +2281,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
DebugLoc Loc) {
assert(canHardenRegister(Reg) && "Cannot harden this register!");
- assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
+ assert(Register::isVirtualRegister(Reg) && "Cannot harden a physical register!");
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
@@ -2289,7 +2292,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
if (Bytes != 8) {
unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
- unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
+ Register NarrowStateReg = MRI->createVirtualRegister(RC);
BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
.addReg(StateReg, 0, SubRegImm);
StateReg = NarrowStateReg;
@@ -2299,7 +2302,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
if (isEFLAGSLive(MBB, InsertPt, *TRI))
FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
- unsigned NewReg = MRI->createVirtualRegister(RC);
+ Register NewReg = MRI->createVirtualRegister(RC);
unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
@@ -2329,13 +2332,13 @@ unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
DebugLoc Loc = MI.getDebugLoc();
auto &DefOp = MI.getOperand(0);
- unsigned OldDefReg = DefOp.getReg();
+ Register OldDefReg = DefOp.getReg();
auto *DefRC = MRI->getRegClass(OldDefReg);
// Because we want to completely replace the uses of this def'ed value with
// the hardened value, create a dedicated new register that will only be used
// to communicate the unhardened value to the hardening.
- unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
+ Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
DefOp.setReg(UnhardenedReg);
// Now harden this register's value, getting a hardened reg that is safe to
@@ -2537,7 +2540,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
.addReg(ExpectedRetAddrReg, RegState::Kill)
.addSym(RetSymbol);
} else {
- unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
+ Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
.addReg(/*Base*/ X86::RIP)
.addImm(/*Scale*/ 1)
@@ -2554,7 +2557,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
- unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+ Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
.addReg(NewStateReg, RegState::Kill)
.addReg(PS->PoisonReg)
@@ -2611,7 +2614,7 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
// For all of these, the target register is the first operand of the
// instruction.
auto &TargetOp = MI.getOperand(0);
- unsigned OldTargetReg = TargetOp.getReg();
+ Register OldTargetReg = TargetOp.getReg();
// Try to lookup a hardened version of this register. We retain a reference
// here as we want to update the map to track any newly computed hardened
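The unsigned-to-Register changes throughout this file (and in several files below) are a mechanical type migration: llvm::Register wraps the old unsigned register id and converts implicitly in both directions, so holding createVirtualRegister results in the stronger type changes no behaviour. A minimal sketch of the interoperation, using only the Register API this diff itself relies on; the helper name is made up for illustration.

#include "llvm/CodeGen/Register.h"

// Hypothetical helper, not part of the patch: shows that Register and the raw
// unsigned id interconvert freely, and that the static isVirtualRegister()
// check replaces the old TRI->isVirtualRegister() call used elsewhere here.
static bool isVirtRegSketch(unsigned RawReg) {
  llvm::Register R(RawReg);   // wrap the raw id
  unsigned Back = R;          // converts back to unsigned where still needed
  (void)Back;
  return llvm::Register::isVirtualRegister(R);
}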
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index d5bb56603df9..f8f78da52cc2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -146,6 +146,9 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_DLLIMPORT;
return X86II::MO_COFFSTUB;
}
+ // Some JIT users use *-win32-elf triples; these shouldn't use GOT tables.
+ if (isOSWindows())
+ return X86II::MO_NO_FLAG;
if (is64Bit()) {
// ELF supports a large, truly PIC code model with non-PC relative GOT
@@ -285,10 +288,10 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
- stackAlignment = StackAlignOverride;
+ stackAlignment = *StackAlignOverride;
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
- stackAlignment = 16;
+ stackAlignment = Align(16);
// Some CPUs have more overhead for gather. The specified overhead is relative
// to the Load operation. "2" is the number provided by Intel architects. This
@@ -304,6 +307,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Consume the vector width attribute or apply any target specific limit.
if (PreferVectorWidthOverride)
PreferVectorWidth = PreferVectorWidthOverride;
+ else if (Prefer128Bit)
+ PreferVectorWidth = 128;
else if (Prefer256Bit)
PreferVectorWidth = 256;
}
@@ -316,12 +321,11 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
- unsigned StackAlignOverride,
+ MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
- : X86GenSubtargetInfo(TT, CPU, FS),
- PICStyle(PICStyles::None), TM(TM), TargetTriple(TT),
- StackAlignOverride(StackAlignOverride),
+ : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM),
+ TargetTriple(TT), StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
@@ -355,7 +359,7 @@ const CallLowering *X86Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
-const InstructionSelector *X86Subtarget::getInstructionSelector() const {
+InstructionSelector *X86Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
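The stack-alignment plumbing in this file (and in X86Subtarget.h below) moves from a raw unsigned byte count to the Align/MaybeAlign types, which is why the override is now dereferenced (*StackAlignOverride) and the default spelled Align(16). A small standalone sketch of the pattern, with illustrative names rather than the subtarget's own:

#include "llvm/Support/Alignment.h"

using llvm::Align;
using llvm::MaybeAlign;

// Mirrors the new initSubtargetFeatures logic in shape only: an optional
// override wins, otherwise fall back to 16 bytes when the target wants it.
static Align pickStackAlign(MaybeAlign Override, bool Wants16ByteStack) {
  if (Override)
    return *Override;                     // MaybeAlign dereferences to Align
  return Wants16ByteStack ? Align(16) : Align(4);
}

Callers that still need a plain byte count can go through Align::value().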
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 24ccc9cb7843..e8efe8f2afe5 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -365,8 +365,8 @@ protected:
/// Processor has AVX-512 vp2intersect instructions
bool HasVP2INTERSECT = false;
- /// Processor supports MPX - Memory Protection Extensions
- bool HasMPX = false;
+ /// Deprecated flag for MPX instructions.
+ bool DeprecatedHasMPX = false;
/// Processor supports CET SHSTK - Control-Flow Enforcement Technology
/// using Shadow Stack
@@ -427,15 +427,21 @@ protected:
/// Use software floating point for code generation.
bool UseSoftFloat = false;
+ /// Use alias analysis during code generation.
+ bool UseAA = false;
+
/// The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
- unsigned stackAlignment = 4;
+ Align stackAlignment = Align(4);
/// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
///
// FIXME: this is a known good value for Yonah. How about others?
unsigned MaxInlineSizeThreshold = 128;
+ /// Indicates target prefers 128 bit instructions.
+ bool Prefer128Bit = false;
+
/// Indicates target prefers 256 bit instructions.
bool Prefer256Bit = false;
@@ -453,7 +459,7 @@ protected:
private:
/// Override the stack alignment.
- unsigned StackAlignOverride;
+ MaybeAlign StackAlignOverride;
/// Preferred vector width from function attribute.
unsigned PreferVectorWidthOverride;
@@ -490,7 +496,7 @@ public:
/// of the specified triple.
///
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
- const X86TargetMachine &TM, unsigned StackAlignOverride,
+ const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth);
@@ -515,7 +521,7 @@ public:
/// Returns the minimum alignment known to hold of the
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
- unsigned getStackAlignment() const { return stackAlignment; }
+ Align getStackAlignment() const { return stackAlignment; }
/// Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -527,7 +533,7 @@ public:
/// Methods used by Global ISel
const CallLowering *getCallLowering() const override;
- const InstructionSelector *getInstructionSelector() const override;
+ InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -684,7 +690,6 @@ public:
bool hasBF16() const { return HasBF16; }
bool hasVP2INTERSECT() const { return HasVP2INTERSECT; }
bool hasBITALG() const { return HasBITALG; }
- bool hasMPX() const { return HasMPX; }
bool hasSHSTK() const { return HasSHSTK; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
bool hasCLWB() const { return HasCLWB; }
@@ -739,6 +744,7 @@ public:
X86ProcFamily == IntelTRM;
}
bool useSoftFloat() const { return UseSoftFloat; }
+ bool useAA() const override { return UseAA; }
/// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
/// no-sse2). There isn't any reason to disable it if the target processor
@@ -809,6 +815,7 @@ public:
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Tail:
case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 0cbf13899a29..c15297134e4d 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -81,27 +81,28 @@ extern "C" void LLVMInitializeX86Target() {
initializeX86SpeculativeLoadHardeningPassPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
+ initializeX86OptimizeLEAPassPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::x86_64)
- return llvm::make_unique<X86_64MachoTargetObjectFile>();
- return llvm::make_unique<TargetLoweringObjectFileMachO>();
+ return std::make_unique<X86_64MachoTargetObjectFile>();
+ return std::make_unique<TargetLoweringObjectFileMachO>();
}
if (TT.isOSFreeBSD())
- return llvm::make_unique<X86FreeBSDTargetObjectFile>();
+ return std::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
- return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
+ return std::make_unique<X86LinuxNaClTargetObjectFile>();
if (TT.isOSSolaris())
- return llvm::make_unique<X86SolarisTargetObjectFile>();
+ return std::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
- return llvm::make_unique<X86FuchsiaTargetObjectFile>();
+ return std::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
- return llvm::make_unique<X86ELFTargetObjectFile>();
+ return std::make_unique<X86ELFTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return llvm::make_unique<TargetLoweringObjectFileCOFF>();
+ return std::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
}
@@ -116,6 +117,9 @@ static std::string computeDataLayout(const Triple &TT) {
!TT.isArch64Bit())
Ret += "-p:32:32";
+ // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers.
+ Ret += "-p270:32:32-p271:32:32-p272:64:64";
+
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
Ret += "-i64:64";
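The new -p270/-p271/-p272 components give pointer widths for the three extra address spaces the comment mentions (32 bit signed, 32 bit unsigned and 64 bit pointers). As a hedged illustration of how such a layout string answers pointer-size queries; the string below is abbreviated, not the exact one computeDataLayout assembles:

#include "llvm/IR/DataLayout.h"
#include <cassert>

int main() {
  // Abbreviated layout string carrying just the entries relevant here.
  llvm::DataLayout DL("e-p:64:64-p270:32:32-p271:32:32-p272:64:64-i64:64");
  assert(DL.getPointerSizeInBits(0)   == 64); // default address space
  assert(DL.getPointerSizeInBits(270) == 32); // 32 bit signed pointers
  assert(DL.getPointerSizeInBits(271) == 32); // 32 bit unsigned pointers
  assert(DL.getPointerSizeInBits(272) == 64); // 64 bit pointers
  return 0;
}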
@@ -218,17 +222,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
OL),
TLOF(createTLOF(getTargetTriple())) {
- // Windows stack unwinder gets confused when execution flow "falls through"
- // after a call to 'noreturn' function.
- // To prevent that, we emit a trap for 'unreachable' IR instructions.
- // (which on X86, happens to be the 'ud2' instruction)
// On PS4, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
- // The check here for 64-bit windows is a bit icky, but as we're unlikely
- // to ever want to mix 32 and 64-bit windows code in a single module
- // this should be fine.
- if ((TT.isOSWindows() && TT.getArch() == Triple::x86_64) || TT.isPS4() ||
- TT.isOSBinFormatMachO()) {
+ if (TT.isPS4() || TT.isOSBinFormatMachO()) {
this->Options.TrapUnreachable = true;
this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
}
@@ -311,10 +307,10 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
- Options.StackAlignmentOverride,
- PreferVectorWidthOverride,
- RequiredVectorWidth);
+ I = std::make_unique<X86Subtarget>(
+ TargetTriple, CPU, FS, *this,
+ MaybeAlign(Options.StackAlignmentOverride), PreferVectorWidthOverride,
+ RequiredVectorWidth);
}
return I.get();
}
@@ -517,12 +513,19 @@ void X86PassConfig::addPreEmitPass() {
}
void X86PassConfig::addPreEmitPass2() {
+ const Triple &TT = TM->getTargetTriple();
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
+
addPass(createX86RetpolineThunksPass());
+
+ // Insert extra int3 instructions after trailing call instructions to avoid
+ // issues in the unwinder.
+ if (TT.isOSWindows() && TT.getArch() == Triple::x86_64)
+ addPass(createX86AvoidTrailingCallPass());
+
// Verify basic block incoming and outgoing cfa offset and register values and
// correct CFA calculation rule where needed by inserting appropriate CFI
// instructions.
- const Triple &TT = TM->getTargetTriple();
- const MCAsmInfo *MAI = TM->getMCAsmInfo();
if (!TT.isOSDarwin() &&
(!TT.isOSWindows() ||
MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index b999e2e86af6..ec3db7b1e9e8 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -16,7 +16,6 @@
#include "X86Subtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
#include <memory>
@@ -26,6 +25,7 @@ namespace llvm {
class StringRef;
class X86Subtarget;
class X86RegisterBankInfo;
+class TargetTransformInfo;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 92e0779c2e74..44185957686b 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -47,8 +47,8 @@ MCSymbol *X86_64MachoTargetObjectFile::getCFIPersonalitySymbol(
}
const MCExpr *X86_64MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
- const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
- MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
// On Darwin/X86-64, we need to use foo@GOTPCREL+4 to access the got entry
// from a data section. In case there's an additional offset, then use
// foo@GOTPCREL+4+<offset>.
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 13d7b4ad70d6..1fd0bbf56b19 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -30,7 +30,8 @@ namespace llvm {
const TargetMachine &TM,
MachineModuleInfo *MMI) const override;
- const MCExpr *getIndirectSymViaGOTPCRel(const MCSymbol *Sym,
+ const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
+ const MCSymbol *Sym,
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3dc59aeb263e..70fd857fcf01 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector && !ST->hasSSE1())
return 0;
@@ -887,7 +888,7 @@ int X86TTIImpl::getArithmeticInstrCost(
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
@@ -911,6 +912,39 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
+ // Handle some cases for widening legalization. For now we only handle
+ // cases where the original subvector was naturally aligned and evenly
+ // fit in its legalized subvector type.
+ // FIXME: Remove some of the alignment restrictions.
+ // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
+ // vectors.
+ int OrigSubElts = SubTp->getVectorNumElements();
+ if (NumSubElts > OrigSubElts &&
+ (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+ LT.second.getVectorElementType() ==
+ SubLT.second.getVectorElementType() &&
+ LT.second.getVectorElementType().getSizeInBits() ==
+ Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+ assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
+ "Unexpected number of elements!");
+ Type *VecTy = VectorType::get(Tp->getVectorElementType(),
+ LT.second.getVectorNumElements());
+ Type *SubTy = VectorType::get(Tp->getVectorElementType(),
+ SubLT.second.getVectorNumElements());
+ int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
+ int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+ ExtractIndex, SubTy);
+
+ // If the original size is 32-bits or more, we can use pshufd. Otherwise
+ // if we have SSSE3 we can use pshufb.
+ if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
+ return ExtractCost + 1; // pshufd or pshufb
+
+ assert(SubTp->getPrimitiveSizeInBits() == 16 &&
+ "Unexpected vector size");
+
+ return ExtractCost + 2; // worst case pshufhw + pshufd
+ }
}
}
@@ -1314,8 +1348,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
@@ -1354,6 +1390,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
@@ -1371,14 +1409,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1402,13 +1440,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
@@ -1421,7 +1459,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
@@ -1507,6 +1548,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
};
@@ -1520,7 +1562,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
@@ -1536,6 +1579,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1562,15 +1607,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+ { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
};
std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
@@ -1691,6 +1742,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
}
}
+ static const CostTblEntry SLMCostTbl[] = {
+ // SLM pcmpeq/pcmpgt throughput is 2
+ { ISD::SETCC, MVT::v2i64, 2 },
+ };
+
static const CostTblEntry AVX512BWCostTbl[] = {
{ ISD::SETCC, MVT::v32i16, 1 },
{ ISD::SETCC, MVT::v64i8, 1 },
@@ -1777,6 +1833,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SELECT, MVT::v4f32, 3 }, // andps + andnps + orps
};
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+ return LT.first * (ExtraCost + Entry->Cost);
+
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
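The SLM hunk above follows the file's usual pattern: a per-subtarget cost table is consulted before the more generic ones, and the first hit is scaled by the legalization factor plus the cmp/select ExtraCost. Below is a minimal standalone sketch of that priority-ordered lookup; Entry, lookup() and the numeric type tags are simplified stand-ins of my own, not LLVM's CostTblEntry/CostTableLookup, and the fallback cost is only illustrative.

    // Minimal standalone sketch of the subtarget-first cost-table lookup used
    // above. Entry/lookup() are simplified stand-ins, not LLVM's CostTable.h.
    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Entry { int ISD; int Ty; int Cost; };

    // Return the cost of the first row matching (ISD, Ty), if any.
    static std::optional<int> lookup(const std::vector<Entry> &Tbl, int ISD, int Ty) {
      for (const Entry &E : Tbl)
        if (E.ISD == ISD && E.Ty == Ty)
          return E.Cost;
      return std::nullopt;
    }

    int main() {
      enum { SETCC = 1, v2i64 = 7 };                     // stand-in tags
      std::vector<Entry> SLMTbl   = {{SETCC, v2i64, 2}}; // SLM-specific row
      std::vector<Entry> SSE42Tbl = {{SETCC, v2i64, 1}}; // generic fallback

      bool IsSLM = true;                 // pretend ST->isSLM()
      int LTFirst = 1, ExtraCost = 0;    // legalization factor, cmp/sel extra

      // The more specific table is consulted first, so SLM's cost of 2 wins.
      if (IsSLM)
        if (std::optional<int> C = lookup(SLMTbl, SETCC, v2i64)) {
          std::printf("cost = %d\n", LTFirst * (ExtraCost + *C));
          return 0;
        }
      if (std::optional<int> C = lookup(SSE42Tbl, SETCC, v2i64))
        std::printf("cost = %d\n", LTFirst * (ExtraCost + *C));
      return 0;
    }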
@@ -2043,8 +2103,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
+ static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTLZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTLZ, MVT::i32, 1 },
+ { ISD::CTLZ, MVT::i16, 1 },
+ { ISD::CTLZ, MVT::i8, 1 },
+ };
+ static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
+ { ISD::CTPOP, MVT::i64, 1 },
+ };
+ static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTPOP, MVT::i32, 1 },
+ { ISD::CTPOP, MVT::i16, 1 },
+ { ISD::CTPOP, MVT::i8, 1 },
+ };
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
};
@@ -2052,6 +2130,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTPOP, MVT::i32, 8 },
+ { ISD::CTPOP, MVT::i16, 9 },
+ { ISD::CTPOP, MVT::i8, 7 },
{ ISD::SADDO, MVT::i32, 1 },
{ ISD::SADDO, MVT::i16, 1 },
{ ISD::SADDO, MVT::i8, 1 },
@@ -2163,6 +2247,26 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasLZCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ if (ST->hasPOPCNT()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ // TODO - add BMI (TZCNT) scalar handling
+
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
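The LZCNT and POPCNT tables above give single-cycle costs when the dedicated instructions are available, while the generic X64/X86 tables keep the larger costs for the emulated sequences noted in the BSR+XOR(+CMOV) comments. Purely as an illustration of that emulated CTLZ sequence, and not the actual SelectionDAG lowering, here is a scalar sketch; ctlz32_bsr_style() is a hypothetical helper of mine.

    // Rough scalar illustration of why CTLZ costs ~4 without LZCNT: find the
    // highest set bit (BSR), XOR the bit index into a leading-zero count, and
    // guard the zero input (the CMOV case). With LZCNT this is one instruction.
    #include <cstdint>
    #include <cstdio>

    static int ctlz32_bsr_style(uint32_t X) {
      if (X == 0)                        // the CMOV-guarded special case
        return 32;
      int HighBit = 31;
      while (!(X & (1u << HighBit)))     // stands in for BSR
        --HighBit;
      return HighBit ^ 31;               // 31 - HighBit, i.e. the leading zeros
    }

    int main() {
      std::printf("%d\n", ctlz32_bsr_style(0x00010000u)); // prints 15
      return 0;
    }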
@@ -2357,8 +2461,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((IsLoad && !isLegalMaskedLoad(SrcVTy)) ||
- (IsStore && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) {
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+ !isPowerOf2_32(NumElem)) {
// Scalarization
int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
@@ -2425,70 +2530,107 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
-
- MVT MTy = LT.second;
-
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use it as the cost.
- static const CostTblEntry SSE42CostTblPairWise[] = {
+ static const CostTblEntry SSE2CostTblPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
+ { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
{ ISD::ADD, MVT::v8i16, 5 },
+ { ISD::ADD, MVT::v2i8, 2 },
+ { ISD::ADD, MVT::v4i8, 2 },
+ { ISD::ADD, MVT::v8i8, 2 },
+ { ISD::ADD, MVT::v16i8, 3 },
};
static const CostTblEntry AVX1CostTblPairWise[] = {
- { ISD::FADD, MVT::v4f32, 4 },
{ ISD::FADD, MVT::v4f64, 5 },
{ ISD::FADD, MVT::v8f32, 7 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
{ ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
- { ISD::ADD, MVT::v8i16, 5 },
{ ISD::ADD, MVT::v8i32, 5 },
+ { ISD::ADD, MVT::v16i16, 6 },
+ { ISD::ADD, MVT::v32i8, 4 },
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
{ ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
{ ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ { ISD::ADD, MVT::v2i8, 2 },
+ { ISD::ADD, MVT::v4i8, 2 },
+ { ISD::ADD, MVT::v8i8, 2 },
+ { ISD::ADD, MVT::v16i8, 3 },
};
static const CostTblEntry AVX1CostTblNoPairWise[] = {
- { ISD::FADD, MVT::v4f32, 3 },
{ ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v4f32, 3 },
{ ISD::FADD, MVT::v8f32, 4 },
{ ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
{ ISD::ADD, MVT::v4i64, 3 },
- { ISD::ADD, MVT::v8i16, 4 },
{ ISD::ADD, MVT::v8i32, 5 },
+ { ISD::ADD, MVT::v16i16, 5 },
+ { ISD::ADD, MVT::v32i8, 4 },
};
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (IsPairwise) {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+ } else {
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+ }
+ }
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
if (IsPairwise) {
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
} else {
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
}
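The reordering above lets getArithmeticReductionCost look up the exact, possibly still-illegal, vector type before legalization, so a narrow reduction such as an ADD over v2i8 is priced from its own row instead of being widened first and then scaled by the legalization factor. Below is a standalone sketch of that "exact type first, legalized type as fallback" pattern; the table values and the toy legalize() rule are illustrative stand-ins of my own, not LLVM's type legalizer or the tables above.

    // Standalone sketch: try the exact (possibly illegal) type first, then fall
    // back to the legalized type scaled by the number of legalization steps.
    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>

    // Toy cost table keyed by type name; narrow v2i8 has its own cheap row.
    static const std::map<std::string, int> AddReductionCost = {
        {"v2i8", 2}, {"v16i8", 3}, {"v4i32", 3}};

    // Toy legalization: v32i8 splits into two v16i8 halves; everything else is
    // treated as already legal. This is not LLVM's legalizer.
    static std::pair<int, std::string> legalize(const std::string &Ty) {
      if (Ty == "v32i8")
        return {2, "v16i8"};
      return {1, Ty};
    }

    static int reductionCost(const std::string &Ty) {
      // 1) Exact-type hit: return the table cost directly, unscaled.
      if (auto It = AddReductionCost.find(Ty); It != AddReductionCost.end())
        return It->second;
      // 2) Fallback: legalize, then scale the legal type's cost.
      auto [Steps, LegalTy] = legalize(Ty);
      auto It = AddReductionCost.find(LegalTy);
      return It != AddReductionCost.end() ? Steps * It->second : -1;
    }

    int main() {
      std::printf("v2i8  add reduction: %d\n", reductionCost("v2i8"));  // 2
      std::printf("v32i8 add reduction: %d\n", reductionCost("v32i8")); // 6
      return 0;
    }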
@@ -3116,7 +3258,7 @@ bool X86TTIImpl::canMacroFuseCmp() {
return ST->hasMacroFusion() || ST->hasBranchFusion();
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
if (!ST->hasAVX())
return false;
@@ -3139,11 +3281,11 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType) {
- return isLegalMaskedLoad(DataType);
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+ return isLegalMaskedLoad(DataType, Alignment);
}
-bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) {
unsigned DataSize = DL.getTypeStoreSize(DataType);
// The only supported nontemporal loads are for aligned vectors of 16 or 32
// bytes. Note that 32-byte nontemporal vector loads are supported by AVX2
@@ -3154,7 +3296,7 @@ bool X86TTIImpl::isLegalNTLoad(Type *DataType, unsigned Alignment) {
return false;
}
-bool X86TTIImpl::isLegalNTStore(Type *DataType, unsigned Alignment) {
+bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) {
unsigned DataSize = DL.getTypeStoreSize(DataType);
// SSE4A supports nontemporal stores of float and double at arbitrary
@@ -3299,9 +3441,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three-way compare (see #33329).
- // TODO: enable AVX512 when the DAG is ready.
- // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
const unsigned PreferredWidth = ST->getPreferVectorWidth();
+ if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
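The memcmp-expansion hunk above now pushes a 64-byte load size when AVX-512 is available and the preferred vector width allows it, ahead of the 32- and 16-byte sizes. Conceptually the expander walks that list largest-first to cover the compared length; the sketch below only illustrates that ordering, with planLoads() as a hypothetical simplification (the real MemCmpExpansion also handles overlapping tail loads, block-count limits, and so on).

    // Standalone sketch of how an ordered LoadSizes list is consumed: cover the
    // compared length greedily with the largest allowed load first.
    #include <cstdio>
    #include <vector>

    static std::vector<unsigned> planLoads(unsigned Len,
                                           const std::vector<unsigned> &LoadSizes) {
      std::vector<unsigned> Plan;
      for (unsigned Size : LoadSizes)    // sizes are listed largest-first
        while (Len >= Size) {
          Plan.push_back(Size);
          Len -= Size;
        }
      return Plan;
    }

    int main() {
      // With AVX-512 and a 512-bit preferred width: 64, 32, 16, then GPR sizes.
      std::vector<unsigned> LoadSizes = {64, 32, 16, 8, 4, 2, 1};
      for (unsigned S : planLoads(100, LoadSizes))
        std::printf("%u ", S);           // prints "64 32 4", covering 100 bytes
      std::printf("\n");
      return 0;
    }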
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 25d9c33eb16d..7581257f41f8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -83,6 +83,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureSlowUAMem32,
// Based on whether user set the -mprefer-vector-width command line.
+ X86::FeaturePrefer128Bit,
X86::FeaturePrefer256Bit,
// CPU name enums. These just follow CPU string.
@@ -115,7 +116,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
@@ -184,10 +185,10 @@ public:
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
- bool isLegalMaskedLoad(Type *DataType);
- bool isLegalMaskedStore(Type *DataType);
- bool isLegalNTLoad(Type *DataType, unsigned Alignment);
- bool isLegalNTStore(Type *DataType, unsigned Alignment);
+ bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+ bool isLegalNTLoad(Type *DataType, Align Alignment);
+ bool isLegalNTStore(Type *DataType, Align Alignment);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index a07d2f20acab..9280d030b5d5 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -292,8 +292,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
// need to insert any VZEROUPPER instructions. This is constant-time, so it
// is cheap in the common case of no ymm/zmm use.
bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
- const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
- for (auto *RC : RCs) {
+ for (auto *RC : {&X86::VR256RegClass, &X86::VR512_0_15RegClass}) {
if (!YmmOrZmmUsed) {
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
i++) {
@@ -304,9 +303,8 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
}
}
}
- if (!YmmOrZmmUsed) {
+ if (!YmmOrZmmUsed)
return false;
- }
assert(BlockStates.empty() && DirtySuccessors.empty() &&
"X86VZeroUpper state should be clear");
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index 9e499db1d7ee..ae72c6427588 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -81,7 +81,7 @@ static int64_t getWinAllocaAmount(MachineInstr *MI, MachineRegisterInfo *MRI) {
MI->getOpcode() == X86::WIN_ALLOCA_64);
assert(MI->getOperand(0).isReg());
- unsigned AmountReg = MI->getOperand(0).getReg();
+ Register AmountReg = MI->getOperand(0).getReg();
MachineInstr *Def = MRI->getUniqueVRegDef(AmountReg);
if (!Def ||
@@ -261,7 +261,7 @@ void X86WinAllocaExpander::lower(MachineInstr* MI, Lowering L) {
break;
}
- unsigned AmountReg = MI->getOperand(0).getReg();
+ Register AmountReg = MI->getOperand(0).getReg();
MI->eraseFromParent();
// Delete the definition of AmountReg.
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index f68d17d7256d..d65e1f3ab414 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -339,7 +339,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
if (UseStackGuard) {
Value *Val = Builder.CreateLoad(Int32Ty, Cookie);
Value *FrameAddr = Builder.CreateCall(
- Intrinsic::getDeclaration(TheModule, Intrinsic::frameaddress),
+ Intrinsic::getDeclaration(
+ TheModule, Intrinsic::frameaddress,
+ Builder.getInt8PtrTy(
+ TheModule->getDataLayout().getAllocaAddrSpace())),
Builder.getInt32(0), "frameaddr");
Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty);
FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val);