Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp              |    3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp          |    8
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp    |    3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp |   17
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp                    |   37
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp                     |   28
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp                     |  190
-rw-r--r--  lib/Target/X86/X86ISelLowering.h                       |   26
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td                       |   26
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td                |   24
-rw-r--r--  lib/Target/X86/X86InstrSSE.td                          |    2
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp              |  300
-rw-r--r--  lib/Target/X86/X86InterleavedAccess.cpp                |  125
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h                     |   24
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp                    |   32
-rw-r--r--  lib/Target/X86/X86MacroFusion.cpp                      |  101
-rw-r--r--  lib/Target/X86/X86MacroFusion.h                        |   11
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp                    |    2
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp                 |    6
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h                   |    5
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp              |  129
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.h                |    7
22 files changed, 831 insertions(+), 275 deletions(-)
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e5d3209ec6a97..d30cc724c203f 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1705,8 +1705,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
unsigned Len = DotDispStr.size();
- unsigned Val = OrigDispVal + DotDispVal;
- InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal);
}
NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 7a9e4f4468ec7..914fb36f91a7d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -108,12 +108,12 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
- assert(Fixup.getOffset() + Size <= DataSize &&
- "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
// Check that upper bits are either all zeros or all ones.
// Specifically ignore overflow/underflow as long as the leakage is
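The updated applyFixup hook receives the fragment contents as a MutableArrayRef<char> instead of a raw pointer plus DataSize, which is why the assert above now checks Data.size(). The patching step that follows is essentially a little-endian byte write at the fixup offset; a minimal sketch, assuming Value has already been adjusted for the fixup kind:

    unsigned Offset = Fixup.getOffset();
    for (unsigned i = 0; i != Size; ++i)
      Data[Offset + i] = uint8_t(Value >> (i * 8)); // patch Size bytes in place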
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 4097ef224d503..caf98bffb80de 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
const MCSymbol *B_Base = Asm.getAtom(*B);
// Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) {
Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation of modified symbol");
return;
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 105580c913a16..5892f1de33eec 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -10,6 +10,7 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
@@ -25,8 +26,8 @@ public:
X86WinCOFFObjectWriter(bool Is64Bit);
~X86WinCOFFObjectWriter() override = default;
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
const MCAsmBackend &MAB) const override;
};
@@ -36,11 +37,19 @@ X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
: COFF::IMAGE_FILE_MACHINE_I386) {}
-unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
- unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ if (FixupKind != FK_Data_4) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 2777fa89330f6..e3aa227702bea 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -748,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
else
CallOp = X86::CALLpcrel32;
- const char *Symbol;
- if (Is64Bit) {
- if (STI.isTargetCygMing()) {
- Symbol = "___chkstk_ms";
- } else {
- Symbol = "__chkstk";
- }
- } else if (STI.isTargetCygMing())
- Symbol = "_alloca";
- else
- Symbol = "_chkstk";
+ StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
MachineInstrBuilder CI;
MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
@@ -769,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
// For the large code model, we have to call through a register. Use R11,
// as it is scratch in all supported calling conventions.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
- .addExternalSymbol(Symbol);
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
} else {
- CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
@@ -783,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addReg(SP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- if (Is64Bit) {
+ if (STI.isTargetWin64() || !STI.isOSWindows()) {
+ // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
- // themselves. It also does not clobber %rax so we can reuse it when
+ // themselves. They also do not clobber %rax so we can reuse it when
// adjusting %rsp.
- BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
+ // All other platforms do not specify a particular ABI for the stack probe
+ // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+ .addReg(SP)
+ .addReg(AX);
}
if (InProlog) {
@@ -978,7 +972,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
- bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+ bool UseRedZone = false;
+ bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
@@ -1007,6 +1002,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
+ !UseStackProbe && // No stack probes.
!IsWin64CC && // Win64 has no Red Zone
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
@@ -1015,6 +1011,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
MFI.setStackSize(StackSize);
+ UseRedZone = true;
}
// Insert stack pointer adjustment for later moving of return addr. Only
@@ -1192,6 +1189,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ assert(!UseRedZone && "The Red Zone is not accounted for in stack probes");
+
// Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB);
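The stack-pointer adjustment above now goes through getSUBrrOpcode(Is64Bit) so the same code path serves 32- and 64-bit targets. That helper is not part of this hunk; presumably it reduces to a simple opcode selector along these lines:

    // Assumed shape of the helper used above; defined elsewhere in this file.
    static unsigned getSUBrrOpcode(bool Is64Bit) {
      return Is64Bit ? X86::SUB64rr : X86::SUB32rr;
    }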
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2a1633de0a239..3c4589ab18f6f 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -204,6 +204,11 @@ namespace {
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ template <class GatherScatterSDNode>
+ bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -1415,13 +1420,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
-
- MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
- if (!Mgs)
- return false;
+template <class GatherScatterSDNode>
+bool X86DAGToDAGISel::selectAddrOfGatherScatterNode(
+ GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1453,6 +1455,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>(
+ Mgs, N, Base, Scale, Index, Disp, Segment);
+ if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>(
+ X86Gather, N, Base, Scale, Index, Disp, Segment);
+ return false;
+}
+
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 172eba0002d4f..f777e56289884 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1662,6 +1662,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
+
+ // TODO: These control memcmp expansion in CGP and are set low to prevent
+ // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder.
+ MaxLoadsPerMemcmp = 1;
+ MaxLoadsPerMemcmpOptSize = 1;
+
// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
@@ -14272,9 +14278,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// If we are inserting an element, see if we can do this more efficiently with
// a blend shuffle with a rematerializable vector than a costly integer
// insertion.
- // TODO: pre-SSE41 targets will tend to use bit masking - this could still
- // be beneficial if we are inserting several zeros and can combine the masks.
- if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
+ 16 <= EltVT.getSizeInBits()) {
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
@@ -17621,23 +17626,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
-
SDValue CmpOp0 = Cmp.getOperand(0);
+
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
- (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
- SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
- DAG.getConstant(0, DL,
- CmpOp0.getValueType()),
- CmpOp0);
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- SDValue(Neg.getNode(), 1));
- return Res;
- }
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
@@ -18648,8 +18651,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
+ bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack;
+ SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
@@ -23705,6 +23709,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue RetOps[] = {Exract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
+ if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+ // There is a special case when the return type v2i32 is illegal and the
+ // type legalizer extended it to v2i64. Without this conversion we end up
+ // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
+ // In order to avoid this situation, we'll build an X86 specific Gather node
+ // with index v2i64 and value type v4i32.
+ assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
+ "Unexpected type in masked gather");
+ Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
+ DAG.getBitcast(MVT::v4i32, Src0),
+ DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+ // The mask should match the destination type. Extending mask with zeroes
+ // is not necessary since the instruction itself reads only two values from
+ // memory.
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended mask and index to 4 elements vector
+ // in order to match requirements of the common gather node - same
+ // vector width of index and value. X86 Gather node allows mismatch
+ // of vector width in order to select a more optimal instruction at the
+ // end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+
+ }
return Op;
}
@@ -24508,6 +24563,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
}
return nullptr;
}
@@ -29868,7 +29924,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
@@ -29880,12 +29936,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
break;
}
}
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -34841,23 +34897,56 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- // (cmp Z, 1) sets the carry flag if Z is 0.
SDValue Z = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
- DAG.getConstant(1, DL, Z.getValueType()));
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
- SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), NewCmp);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), NewCmp);
+ DAG.getConstant(0, DL, VT), Cmp1);
}
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
@@ -34976,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
const SDNodeFlags Flags = N->getFlags();
@@ -34995,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -35028,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -36335,3 +36456,22 @@ void X86TargetLowering::insertCopiesSplitCSR(
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+
+/// Returns the name of the symbol used to emit stack probes or the empty
+/// string if not applicable.
+StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // If the function specifically requests stack probes, emit them.
+ if (MF.getFunction()->hasFnAttribute("probe-stack"))
+ return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
+
+ // Generally, if we aren't on Windows, the platform ABI does not include
+ // support for stack probes, so don't emit them.
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ return "";
+
+ // We need a stack probe to conform to the Windows ABI. Choose the right
+ // symbol.
+ if (Subtarget.is64Bit())
+ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+ return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}
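getStackProbeSymbolName honors a per-function "probe-stack" attribute before falling back to the Windows defaults, which lets front ends request probing on any target. A hedged sketch of how a front end could attach that attribute when building IR; the probe symbol name below is purely illustrative:

    #include "llvm/IR/Function.h"
    using namespace llvm;

    void requestStackProbes(Function &F) {
      // "__probestack" is a hypothetical symbol; the backend will emit calls
      // to whatever name is given here instead of the default __chkstk family.
      F.addFnAttr("probe-stack", "__probestack");
    }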
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index f51b6641db2fb..e1ade92979dc0 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -615,7 +615,10 @@ namespace llvm {
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS, VTRUNCSTORES,
// Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS, VMTRUNCSTORES
+ VMTRUNCSTOREUS, VMTRUNCSTORES,
+
+ // X86 specific gather
+ MGATHER
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
@@ -1056,6 +1059,8 @@ namespace llvm {
bool supportSwiftError() const override;
+ StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
/// \brief Lower interleaved load(s) into target specific
@@ -1065,6 +1070,12 @@ namespace llvm {
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
+ /// \brief Lower interleaved store(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
+
void finalizeLowering(MachineFunction &MF) const override;
protected:
@@ -1397,6 +1408,19 @@ namespace llvm {
}
};
+ // X86 specific Gather node.
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode {
+ public:
+ X86MaskedGatherSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO)
+ {}
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER;
+ }
+ };
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
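The classof hook on X86MaskedGatherSDNode is what makes the new node visible to LLVM's isa/dyn_cast machinery; the address-selection change in X86ISelDAGToDAG.cpp relies on exactly this. A minimal sketch of the pattern:

    if (auto *Gather = dyn_cast<X86MaskedGatherSDNode>(Parent)) {
      // Only X86ISD::MGATHER nodes reach this point, so the memory VT and
      // index operands can be inspected without further opcode checks.
      EVT MemVT = Gather->getMemoryVT();
      (void)MemVT;
    }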
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2620679df2517..01a70323224c3 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -7265,13 +7265,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>;
def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>;
def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
@@ -7281,13 +7281,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x1))), _.FRC)>;
+ addr:$src, (i32 0x9))), _.FRC)>;
def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x2))), _.FRC)>;
+ addr:$src, (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x3))), _.FRC)>;
+ addr:$src, (i32 0xb))), _.FRC)>;
def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0x4))), _.FRC)>;
@@ -7869,7 +7869,7 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64>, EVEX_V128;
+ vx64xmem, X86mgatherv2i64>, EVEX_V128;
}
}
@@ -8471,26 +8471,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
}
let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
def : Pat<(v16f32 (fnearbyint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
def : Pat<(v16f32 (frint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
def : Pat<(v8f64 (fnearbyint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
def : Pat<(v8f64 (frint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
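The immediate changes above (0x1 -> 0x9, 0x2 -> 0xA, 0x3 -> 0xB) keep the rounding mode in imm[1:0] but also set imm[3] of the ROUND/VRNDSCALE immediate, which suppresses the precision (inexact) exception, matching the exception-free semantics expected of ffloor/fceil/ftrunc. A small sketch of the assumed immediate layout:

    #include <cstdint>
    constexpr uint8_t RC_FLOOR = 0x1, RC_CEIL = 0x2, RC_TRUNC = 0x3;
    constexpr uint8_t SUPPRESS_PREC_EXC = 0x8; // imm[3]: do not raise #PE
    static_assert((RC_FLOOR | SUPPRESS_PREC_EXC) == 0x9, "floor");
    static_assert((RC_CEIL  | SUPPRESS_PREC_EXC) == 0xA, "ceil");
    static_assert((RC_TRUNC | SUPPRESS_PREC_EXC) == 0xB, "trunc");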
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index c28b35b22977a..8b5bbf24f6f63 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -641,7 +641,7 @@ def sdmem : Operand<v2f64> {
// SSE pattern fragments
//===----------------------------------------------------------------------===//
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
+// Vector load wrappers to prevent folding of non-temporal aligned loads on
// supporting targets.
def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() ||
@@ -754,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-// These are needed to match a scalar memop that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def memopfsf32_128 : PatFrag<(ops node:$ptr),
- (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
-def memopfsf64_128 : PatFrag<(ops node:$ptr),
- (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
-
-
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
@@ -773,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
@@ -796,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
Mgt->getBasePtr().getValueType() == MVT::v2i64);
return false;
}]>;
+def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+ (Mgt->getMemoryVT() == MVT::v2i32 ||
+ Mgt->getMemoryVT() == MVT::v2f32);
+ return false;
+}]>;
def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 8490b972eb5c1..fe87bbd994738 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1744,7 +1744,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
-def : Pat<(f32 (fpround FR64:$src)),
+def : Pat<(f32 (fpround FR64:$src)),
(VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
Requires<[UseAVX]>;
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 77dead8d24137..f98c2a7e802dd 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -72,9 +72,24 @@ private:
MachineFunction &MF) const;
bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
-
bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
+ // emit insert subreg instruction and insert it before MachineInstr &I
+ bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+ // emit extract subreg instruction and insert it before MachineInstr &I
+ bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+
+ const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const;
+ const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const;
const X86TargetMachine &TM;
const X86Subtarget &STI;
@@ -113,8 +128,8 @@ X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM,
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
-static const TargetRegisterClass *
-getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
if (RB.getID() == X86::GPRRegBankID) {
if (Ty.getSizeInBits() <= 8)
return &X86::GR8RegClass;
@@ -127,13 +142,13 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
}
if (RB.getID() == X86::VECRRegBankID) {
if (Ty.getSizeInBits() == 32)
- return &X86::FR32XRegClass;
+ return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
if (Ty.getSizeInBits() == 64)
- return &X86::FR64XRegClass;
+ return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
if (Ty.getSizeInBits() == 128)
- return &X86::VR128XRegClass;
+ return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass;
if (Ty.getSizeInBits() == 256)
- return &X86::VR256XRegClass;
+ return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass;
if (Ty.getSizeInBits() == 512)
return &X86::VR512RegClass;
}
@@ -141,10 +156,16 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
llvm_unreachable("Unknown RegBank!");
}
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const {
+ const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI);
+ return getRegClass(Ty, RegBank);
+}
+
// Set X86 Opcode and constrain DestReg.
-static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
- MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
- const RegisterBankInfo &RBI) {
+bool X86InstructionSelector::selectCopy(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
unsigned DstReg = I.getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
@@ -171,7 +192,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
switch (RegBank.getID()) {
case X86::GPRRegBankID:
assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values.");
- RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
// Change the physical register
if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
@@ -186,7 +207,7 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
}
break;
case X86::VECRRegBankID:
- RC = getRegClassForTypeOnBank(MRI.getType(DstReg), RegBank);
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
break;
default:
llvm_unreachable("Unknown RegBank!");
@@ -220,7 +241,7 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
// Certain non-generic instructions also need some special handling.
if (I.isCopy())
- return selectCopy(I, TII, MRI, TRI, RBI);
+ return selectCopy(I, MRI);
// TODO: handle more cases - LOAD_STACK_GUARD, PHI
return true;
@@ -249,6 +270,10 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
return true;
if (selectUadde(I, MRI, MF))
return true;
+ if (selectExtract(I, MRI, MF))
+ return true;
+ if (selectInsert(I, MRI, MF))
+ return true;
return false;
}
@@ -326,6 +351,34 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
return Opc;
}
+// Fill in an address from the given instruction.
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+
+ assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+ return;
+}
+
bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
@@ -340,18 +393,28 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
if (NewOpc == Opc)
return false;
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
I.setDesc(TII.get(NewOpc));
MachineInstrBuilder MIB(MF, I);
- if (Opc == TargetOpcode::G_LOAD)
- addOffset(MIB, 0);
- else {
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
// G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
+ I.RemoveOperand(1);
I.RemoveOperand(0);
- addOffset(MIB, 0).addUse(DefReg);
+ addFullAddress(MIB, AM).addUse(DefReg);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -461,11 +524,11 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
if (DstRB.getID() != X86::GPRRegBankID)
return false;
- const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
if (!DstRC)
return false;
- const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
if (!SrcRC)
return false;
@@ -519,9 +582,8 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
else
return false;
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
unsigned DefReg =
- MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank));
+ MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
@@ -656,6 +718,202 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_EXTRACT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ // For now, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+ return false; // Not extract subvector.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_INSERT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+ // For now, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+ return false; // Not insert subvector.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
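Both selectExtract and selectInsert above convert a G_EXTRACT/G_INSERT bit offset into the lane immediate expected by VEXTRACT/VINSERT by dividing by the subvector width. A self-contained sketch of that conversion (function name and values chosen for illustration):

    #include <cassert>
    #include <cstdint>

    static unsigned subvectorImm(int64_t BitOffset, unsigned SubVecBits) {
      assert(BitOffset % SubVecBits == 0 && "not a whole-subvector offset");
      return unsigned(BitOffset / SubVecBits);
    }
    // e.g. extracting the upper xmm of a ymm: subvectorImm(128, 128) == 1.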
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 806d6cc888f0f..f0ed4bc16e2f9 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,6 +16,7 @@
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
+#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
IRBuilder<> &Builder;
/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
- /// sub vectors of type \p T. Returns true and the sub-vectors in
- /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
- bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ public:
/// target information \p STarget.
explicit X86InterleavedAccessGroup(Instruction *I,
ArrayRef<ShuffleVectorInst *> Shuffs,
- ArrayRef<unsigned> Ind,
- const unsigned F,
+ ArrayRef<unsigned> Ind, const unsigned F,
const X86Subtarget &STarget,
IRBuilder<> &B)
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -102,48 +101,61 @@ bool X86InterleavedAccessGroup::isSupported() const {
uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
- if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
- return false;
+ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ uint64_t ExpectedShuffleVecSize;
+ if (isa<LoadInst>(Inst))
+ ExpectedShuffleVecSize = 256;
+ else
+ ExpectedShuffleVecSize = 1024;
- // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+ if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
return false;
return true;
}
-bool X86InterleavedAccessGroup::decompose(
+void X86InterleavedAccessGroup::decompose(
Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
+
+ assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
+ "Expected Load or Shuffle");
+
Type *VecTy = VecInst->getType();
(void)VecTy;
assert(VecTy->isVectorTy() &&
DL.getTypeSizeInBits(VecTy) >=
DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
"Invalid Inst-size!!!");
- assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
- "Element type mismatched!!!");
- if (!isa<LoadInst>(VecInst))
- return false;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+
+ // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
+ for (unsigned i = 0; i < NumSubVectors; ++i)
+ DecomposedVectors.push_back(
+ cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Indices[i],
+ SubVecTy->getVectorNumElements(), 0))));
+ return;
+ }
+ // Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
-
Value *VecBasePtr =
Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
- // Generate N loads of T type
+ // Generate N loads of T type.
for (unsigned i = 0; i < NumSubVectors; i++) {
- // TODO: Support inbounds GEP
+ // TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}
-
- return true;
}
void X86InterleavedAccessGroup::transpose_4x4(
@@ -181,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4(
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
- VectorType *VecTy = Shuffles[0]->getType();
- // Try to generate target-sized register(/instruction).
- if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
- return false;
-
SmallVector<Value *, 4> TransposedVectors;
- // Perform matrix-transposition in order to compute interleaved
- // results by generating some sort of (optimized) target-specific
- // instructions.
+ VectorType *ShuffleTy = Shuffles[0]->getType();
+
+ if (isa<LoadInst>(Inst)) {
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+
+ // Now replace the unoptimized-interleaved-vectors with the
+ // transposed-interleaved vectors.
+ for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
+ Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+ return true;
+ }
+
+ Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
+ unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
+
+ // Lower the interleaved stores:
+ // 1. Decompose the interleaved wide shuffle into individual shuffle
+ // vectors.
+ decompose(Shuffles[0], Factor,
+ VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+
+ // 2. Transpose the interleaved-vectors into vectors of contiguous
+ // elements.
transpose_4x4(DecomposedVectors, TransposedVectors);
- // Now replace the unoptimized-interleaved-vectors with the
- // transposed-interleaved vectors.
- for (unsigned i = 0; i < Shuffles.size(); i++)
- Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+ // 3. Concatenate the contiguous-vectors back into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, TransposedVectors);
+
+ // 4. Generate a store instruction for wide-vec.
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
+ SI->getAlignment());
return true;
}
@@ -220,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
+
+bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ // Holds the indices of SVI that correspond to the starting index of each
+ // interleaved shuffle.
+ SmallVector<unsigned, 4> Indices;
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++)
+ Indices.push_back(Mask[i]);
+
+ ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(SI);
+ X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
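lowerInterleavedStore records the first Factor entries of the wide shuffle mask as the starting index of each interleaved member before handing them to the access group. For a factor-4 store of four 4-element members, the re-interleaving mask usually looks like the sketch below (values illustrate the common pattern and are not taken from a specific test):

    #include <array>
    int main() {
      // A0 B0 C0 D0 A1 B1 C1 D1 ...: members A..D occupy lanes 0-3, 4-7, 8-11, 12-15.
      std::array<int, 16> Mask = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
      constexpr unsigned Factor = 4;
      unsigned Indices[Factor]; // becomes {0, 4, 8, 12}, one start index per member
      for (unsigned i = 0; i < Factor; ++i)
        Indices[i] = unsigned(Mask[i]);
      (void)Indices;
    }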
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index bc73bb1ae8c51..6b1add8ff8ed1 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -510,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
@@ -524,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
@@ -1171,18 +1159,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 979aaee110aa4..a584eabcc1b28 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -214,12 +214,24 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
if (!Subtarget.hasAVX())
return;
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
const LLT v8s32 = LLT::vector(8, 32);
const LLT v4s64 = LLT::vector(4, 64);
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v8s32, v4s64})
setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({G_INSERT, Ty}, Legal);
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
+ setAction({G_INSERT, 1, Ty}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoAVX2() {
@@ -243,6 +255,18 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
if (!Subtarget.hasAVX512())
return;
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
const LLT v16s32 = LLT::vector(16, 32);
const LLT v8s64 = LLT::vector(8, 64);
@@ -256,13 +280,15 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() {
for (auto Ty : {v16s32, v8s64})
setAction({MemOp, Ty}, Legal);
+ for (auto Ty : {v64s8, v32s16, v16s32, v8s64})
+ setAction({G_INSERT, Ty}, Legal);
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64})
+ setAction({G_INSERT, 1, Ty}, Legal);
+
/************ VLX *******************/
if (!Subtarget.hasVLX())
return;
- const LLT v4s32 = LLT::vector(4, 32);
- const LLT v8s32 = LLT::vector(8, 32);
-
for (auto Ty : {v4s32, v8s32})
setAction({G_MUL, Ty}, Legal);
}
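
The new G_INSERT rules make it legal, under AVX, to insert a 128-bit vector (type index 1) into a 256-bit vector (type index 0), and analogously up to 512 bits under AVX-512. A hedged intrinsics-level sketch of the underlying hardware operation (illustrative only, assumes an AVX-enabled host; not part of the patch):

#include <immintrin.h>

// vinsertf128 inserts a 128-bit lane into a 256-bit register; this is the
// capability that makes e.g. {G_INSERT, v8s32, v4s32} legal with AVX.
__m256 insertLowLane(__m256 Acc, __m128 Lane) {
  return _mm256_insertf128_ps(Acc, Lane, 0); // replace the low 128-bit lane
}
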
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index dd21e2b7c4a13..8fdf10617059a 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -2,39 +2,31 @@
//
// The LLVM Compiler Infrastructure
//
-// \file This file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
-// This file contains the X86 implementation of the DAG scheduling mutation to
-// pair instructions back to back.
+/// \file This file contains the X86 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetInstrInfo.h"
-
-#define DEBUG_TYPE "misched"
-
-STATISTIC(NumFused, "Number of instr pairs fused");
+#include "llvm/CodeGen/MacroFusion.h"
using namespace llvm;
-static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
- cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
-namespace {
-
-/// \brief Verify that the instruction pair, First and Second,
-/// should be scheduled back to back. If either instruction is unspecified,
-/// then verify that the other instruction may be part of a pair at all.
-static bool shouldScheduleAdjacent(const X86Subtarget &ST,
- const MachineInstr *First,
- const MachineInstr *Second) {
+/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
+/// fused together. If FirstMI is unspecified, check whether SecondMI may be
+/// part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
@@ -47,13 +39,10 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST,
FuseInc
} FuseKind;
- assert((First || Second) && "At least one instr must be specified");
- unsigned FirstOpcode = First
- ? First->getOpcode()
+ unsigned FirstOpcode = FirstMI
+ ? FirstMI->getOpcode()
: static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
- unsigned SecondOpcode = Second
- ? Second->getOpcode()
- : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
switch (SecondOpcode) {
default:
@@ -203,69 +192,11 @@ static bool shouldScheduleAdjacent(const X86Subtarget &ST,
}
}
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class X86MacroFusion : public ScheduleDAGMutation {
-public:
- X86MacroFusion() {}
-
- void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-
-void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
- const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
-
- // For now, assume targets can only fuse with the branch.
- SUnit &ExitSU = DAG->ExitSU;
- MachineInstr *Branch = ExitSU.getInstr();
- if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
- return;
-
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.isWeak())
- continue;
- SUnit &SU = *PredDep.getSUnit();
- MachineInstr &Pred = *SU.getInstr();
- if (!shouldScheduleAdjacent(ST, &Pred, Branch))
- continue;
-
- // Create a single weak edge from SU to ExitSU. The only effect is to cause
- // bottom-up scheduling to heavily prioritize the clustered SU. There is no
- // need to copy predecessor edges from ExitSU to SU, since top-down
- // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
- // of SU, we could create an artificial edge from the deepest root, but it
- // hasn't been needed yet.
- bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
- (void)Success;
- assert(Success && "No DAG nodes should be reachable from ExitSU");
-
- // Adjust latency of data deps between the nodes.
- for (SDep &PredDep : ExitSU.Preds)
- if (PredDep.getSUnit() == &SU)
- PredDep.setLatency(0);
- for (SDep &SuccDep : SU.Succs)
- if (SuccDep.getSUnit() == &ExitSU)
- SuccDep.setLatency(0);
-
- ++NumFused;
- DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
- SU.print(dbgs(), DAG);
- dbgs() << " - ExitSU"
- << " / " << DAG->TII->getName(Pred.getOpcode()) << " - "
- << DAG->TII->getName(Branch->getOpcode()) << '\n';);
-
- break;
- }
-}
-
-} // end namespace
-
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createX86MacroFusionDAGMutation () {
- return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
+ return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent);
}
} // end namespace llvm
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
index e630f802e8e63..13fa2d78a0185 100644
--- a/lib/Target/X86/X86MacroFusion.h
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -2,23 +2,18 @@
//
// The LLVM Compiler Infrastructure
//
-// \file This file is distributed under the University of Illinois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
-// This file contains the X86 definition of the DAG scheduling mutation to pair
-// instructions back to back.
+/// \file This file contains the X86 definition of the DAG scheduling mutation
+/// to pair instructions back to back.
//
//===----------------------------------------------------------------------===//
-#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
-//===----------------------------------------------------------------------===//
-// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
namespace llvm {
/// Note that you have to add:
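
The note above is cut short by the diff context; it refers to registering the mutation on the scheduling DAG. A sketch of that wiring, assuming the usual X86PassConfig::createMachineScheduler() override in X86TargetMachine.cpp (shown for orientation, not part of this patch):

#include "X86MacroFusion.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

// Register the macro-fusion mutation so a fusable instruction and the branch
// it can fuse with are clustered by the scheduler.
ScheduleDAGInstrs *
X86PassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createX86MacroFusionDAGMutation());
  return DAG;
}
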
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 278b57eb00b74..a9f42cacf7886 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -91,6 +91,8 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<X86FreeBSDTargetObjectFile>();
if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSSolaris())
+ return llvm::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 4fd95717478e9..8627c06d44313 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
InitializeELF(TM.Options.UseInitArray);
}
+void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
const GlobalValue *LHS, const GlobalValue *RHS,
const TargetMachine &TM) const {
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 39d2e84e5ed77..f6aa570b6332a 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -66,6 +66,11 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
+ /// \brief This implementation is used for Solaris on x86/x86-64.
+ class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
/// \brief This implementation is used for Windows targets on x86 and x86-64.
class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
const MCExpr *
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 11ba7025e1b73..5ba8534d32d33 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2178,17 +2178,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
-bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2) {
- // X86 specific here are "instruction number 1st priority".
- return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
- C1.NumIVMuls, C1.NumBaseAdds,
- C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
- std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
- C2.NumIVMuls, C2.NumBaseAdds,
- C2.ScaleCost, C2.ImmCost, C2.SetupCost);
-}
-
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
@@ -2243,6 +2232,12 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ // TODO: We can increase these based on available vector ops.
+ MaxLoadSize = ST->is64Bit() ? 8 : 4;
+ return true;
+}
+
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
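
The expandMemCmp() hook added above only reports the widest load the expansion may use (8 bytes in 64-bit mode, otherwise 4); the actual rewriting is left to the generic memcmp-expansion machinery. A plain C++ sketch of what an equality-only expansion with MaxLoadSize = 8 amounts to (illustrative, hypothetical helper name, not the pass itself):

#include <cstdint>
#include <cstring>

// With MaxLoadSize = 8 on x86-64, an equality test memcmp(A, B, 16) == 0 can
// be turned into two 8-byte word comparisons instead of a library call.
static bool equal16(const void *A, const void *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8);
  std::memcpy(&B0, B, 8);
  std::memcpy(&A1, static_cast<const char *>(A) + 8, 8);
  std::memcpy(&B1, static_cast<const char *>(B) + 8, 8);
  return (A0 == B0) & (A1 == B1); // branchless: both halves must match
}
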
@@ -2250,6 +2245,114 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
return !(ST->isAtom());
}
+// Get an estimate of the cost of interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - the number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps, they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved access as given in \p Factor), the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+  // We currently support only fully-interleaved groups, with no gaps.
+  // TODO: Also support strided loads (interleaved groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+  // This function can be called with VecTy = <6 x i128> and Factor = 3, in
+  // which case VF = 2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+  // Calculate the number of memory operations (NumOfMemOps) required to
+  // load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+  // TODO: Complete for other data types and strides.
+  // Each combination of Stride, ElementTy and VF results in a different
+  // sequence; the cost tables are therefore accessed with:
+  // Factor (stride) and VectorType=VFxElemType.
+  // The cost accounts only for the shuffle sequence;
+  // the cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+    { 3, MVT::v16i8, 18 }, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+}
+
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
@@ -2358,6 +2461,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
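
A worked instance of the AVX2 cost formula above, with the per-memory-op cost kept symbolic (illustrative sketch; the helper name is hypothetical and the legalized type is assumed to be v32i8 for <48 x i8> on AVX2):

// Interleaved load of VecTy = <48 x i8> with Factor = 3:
//   VF = 48 / 3 = 16, so ETy = v16i8 and the table entry is { 3, v16i8, 18 }.
unsigned interleavedLoadCostExample(unsigned MemOpCost) {
  const unsigned VecTySize = 48;   // store size of <48 x i8>, in bytes
  const unsigned LegalVTSize = 32; // store size of v32i8, in bytes
  const unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // 2
  const unsigned ShuffleCost = 18; // AVX2InterleavedLoadTbl entry { 3, v16i8 }
  return NumOfMemOps * MemOpCost + ShuffleCost; // = 2 * MemOpCost + 18
}
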
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 09ce2c90498d9..ad0a0a2113012 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -93,6 +93,9 @@ public:
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
int getIntImmCost(int64_t);
@@ -101,15 +104,13 @@ public:
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
- bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
- TargetTransformInfo::LSRCost &C2);
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
-
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,