path: root/lib/Target/X86/X86TargetTransformInfo.cpp
author     Dimitry Andric <dim@FreeBSD.org>  2014-11-24 09:08:18 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2014-11-24 09:08:18 +0000
commit     5ca98fd98791947eba83a1ed3f2c8191ef7afa6c (patch)
tree       f5944309621cee4fe0976be6f9ac619b7ebfc4c2 /lib/Target/X86/X86TargetTransformInfo.cpp
parent     68bcb7db193e4bc81430063148253d30a791023e (diff)
download   src-5ca98fd98791947eba83a1ed3f2c8191ef7afa6c.tar.gz
           src-5ca98fd98791947eba83a1ed3f2c8191ef7afa6c.zip
Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp  504
1 file changed, 437 insertions(+), 67 deletions(-)
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f88a666092bc..c961e2f5b2c8 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,17 +14,19 @@
///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+#define DEBUG_TYPE "x86tti"
+
// Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
@@ -32,7 +34,7 @@ void initializeX86TTIPass(PassRegistry &);
namespace {
-class X86TTI : public ImmutablePass, public TargetTransformInfo {
+class X86TTI final : public ImmutablePass, public TargetTransformInfo {
const X86Subtarget *ST;
const X86TargetLowering *TLI;
@@ -41,25 +43,21 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
X86TTI(const X86TargetMachine *TM)
- : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
- TLI(TM->getTargetLowering()) {
+ : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
initializeX86TTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() {
+ void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
@@ -67,7 +65,7 @@ public:
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
@@ -75,35 +73,43 @@ public:
/// \name Scalar TTI Implementations
/// @{
- virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
- virtual unsigned getNumberOfRegisters(bool Vector) const;
- virtual unsigned getRegisterBitWidth(bool Vector) const;
- virtual unsigned getMaximumUnrollFactor() const;
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind,
- OperandValueKind) const;
- virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const;
- virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
- virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const;
- virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const;
- virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) const;
-
- virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
-
- virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm) const;
+ unsigned getNumberOfRegisters(bool Vector) const override;
+ unsigned getRegisterBitWidth(bool Vector) const override;
+ unsigned getMaximumUnrollFactor() const override;
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+ OperandValueKind) const override;
+ unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const override;
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const override;
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const override;
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const override;
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
+
+ unsigned getAddressComputationCost(Type *PtrTy,
+ bool IsComplex) const override;
+
+ unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const override;
+
+ unsigned getIntImmCost(int64_t) const;
+
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+ unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
+ unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
/// @}
};
@@ -138,13 +144,17 @@ unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
- if (ST->is64Bit())
+ if (ST->is64Bit()) {
+ if (Vector && ST->hasAVX512())
+ return 32;
return 16;
+ }
return 8;
}
unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
if (Vector) {
+ if (ST->hasAVX512()) return 512;
if (ST->hasAVX()) return 256;
if (ST->hasSSE1()) return 128;
return 0;
@@ -177,6 +187,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ static const CostTblEntry<MVT::SimpleValueType>
+ AVX2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+ }
+
static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
@@ -214,6 +239,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX2, a packed v16i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return LT.first;
+
int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * AVX2CostTable[Idx].Cost;
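To make the interplay between the legalization factor LT.first and the new cost tables concrete, here is a minimal sketch (not part of the patch; it only restates the arithmetic, assuming an AVX2 target where an <32 x i16> operand is legalized into two v16i16 halves):

#include <cstdio>

int main() {
  // sdiv <32 x i16> by a uniform constant on AVX2:
  // getTypeLegalizationCost() yields LT.first = 2 parts of type v16i16, and
  // each part hits the { ISD::SDIV, MVT::v16i16, 6 } entry above
  // (the vpmulhw sequence).
  unsigned NumParts = 2;
  unsigned CostPerPart = 6;
  std::printf("estimated cost = %u\n", NumParts * CostPerPart); // prints 12
  return 0;
}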
@@ -237,15 +269,38 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
}
+ if (ISD == ISD::SHL &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+ EVT VT = LT.second;
+ if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+ (VT == MVT::v4i32 && ST->hasSSE41()))
+ // Vector shift left by non-uniform constant can be lowered
+ // into vector multiply (pmullw/pmulld).
+ return LT.first;
+ if (VT == MVT::v4i32 && ST->hasSSE2())
+ // A vector shift left by non-uniform constant is converted
+ // into a vector multiply; the new multiply is eventually
+ // lowered into a sequence of shuffles and 2 x pmuludq.
+ ISD = ISD::MUL;
+ }
static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
@@ -260,6 +315,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
{ ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
{ ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
+ { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
{ ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
{ ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
@@ -297,6 +353,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
{ ISD::MUL, MVT::v8i32, 4 },
{ ISD::SUB, MVT::v8i32, 4 },
{ ISD::ADD, MVT::v8i32, 4 },
@@ -312,7 +369,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX1 lowering tricks.
if (ST->hasAVX() && !ST->hasAVX2()) {
- int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
+ EVT VT = LT.second;
+
+ // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+ // sequence of extract + two vector multiply + insert.
+ if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+ ISD = ISD::MUL;
+
+ int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
if (Idx != -1)
return LT.first * AVX1CostTable[Idx].Cost;
}
@@ -332,7 +397,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// 2x pmuludq, 2x shuffle.
if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
!ST->hasSSE41())
- return 6;
+ return LT.first * 6;
// Fallback to the default implementation.
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
@@ -341,17 +406,117 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) const {
- // We only estimate the cost of reverse shuffles.
- if (Kind != SK_Reverse)
+ // We only estimate the cost of reverse and alternate shuffles.
+ if (Kind != SK_Reverse && Kind != SK_Alternate)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
- unsigned Cost = 1;
- if (LT.second.getSizeInBits() > 128)
- Cost = 3; // Extract + insert + copy.
+ if (Kind == SK_Reverse) {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+ // Multiply by the number of parts.
+ return Cost * LT.first;
+ }
+
+ if (Kind == SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+ // The backend knows how to generate a single VEX.256 version of
+ // instruction VPBLENDW if the target supports AVX2.
+ if (ST->hasAVX2() && LT.second == MVT::v16i16)
+ return LT.first;
+
+ static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
+ {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
+
+ // This shuffle is custom lowered into a sequence of:
+ // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
+
+ // This shuffle is custom lowered into a long sequence of:
+ // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
+ {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
+ };
+
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * AVXAltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
+ // These are lowered into movsd.
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+ // packed float vectors with four elements are lowered into BLENDI dag
+ // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+
+ // This shuffle generates a single pshufw.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+
+ // There is no instruction that matches a v16i8 alternate shuffle.
+ // The backend will expand it into the sequence 'pshufb + pshufb + or'.
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
+ };
+
+ if (ST->hasSSE41()) {
+ int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSE41AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
- // Multiple by the number of parts.
- return Cost * LT.first;
+ // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
+ // the sequence 'shufps + pshufd'
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
+ };
+
+ if (ST->hasSSSE3()) {
+ int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
+ }
+
+ static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
+ {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
+ {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
+
+ {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
+ {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
+
+ // This is expanded into a long sequence of four extract + four insert.
+ {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
+
+ // 8 x (pinsrw + pextrw + and + movb + movzb + or)
+ {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
+ };
+
+ // Fall-back (SSE3 and SSE2).
+ int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+ if (Idx != -1)
+ return LT.first * SSEAltShuffleTbl[Idx].Cost;
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+ }
+
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}
unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
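An "alternate" shuffle (SK_Alternate) picks each result lane from the corresponding lane of one of the two source vectors, for example even lanes from the first operand and odd lanes from the second. A small sketch (not part of the patch) printing such a mask for two <16 x i8> inputs, with the costs the tables above assign to it:

#include <cstdio>

int main() {
  // Alternate shuffle mask for two <16 x i8> vectors: even lanes come from
  // the first operand (elements 0..15), odd lanes from the second (16..31).
  for (int i = 0; i < 16; ++i)
    std::printf("%d ", (i & 1) ? i + 16 : i);
  std::printf("\n"); // 0 17 2 19 4 21 6 23 8 25 10 27 12 29 14 31
  // v16i8 is already a legal type (LT.first == 1), so per the tables above
  // this shuffle is estimated at 3 on SSSE3 (pshufb + pshufb + por) and at
  // 48 on plain SSE2 (8 x pinsrw/pextrw/and/movb/movzb/or).
  return 0;
}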
@@ -400,16 +565,58 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ AVX2ConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
+ };
+
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
@@ -436,17 +643,32 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ // The generic code to compute the scalar overhead is currently broken.
+ // Workaround this limitation by estimating the scalarization overhead
+ // here. We have roughly 10 instructions per scalar element.
+ // Multiply that by the vector width.
+ // FIXME: remove that when PR19268 is fixed.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ // This node is expanded into scalarized operations but BasicTTI is overly
+ // optimistic estimating its cost. It computes 3 per element (one
+ // vector-extract, one scalar conversion and one vector-insert). The
+ // problem is that the inserts form a read-modify-write chain so latency
+ // should be factored in too. Inflating the cost per element by 1.
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
};
+ if (ST->hasAVX2()) {
+ int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVX2ConversionTbl[Idx].Cost;
+ }
+
if (ST->hasAVX()) {
int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT());
@@ -555,7 +777,7 @@ unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const {
- // Handle non power of two vectors such as <3 x float>
+ // Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -570,7 +792,7 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
// Cost = 128 bit store + unpack + 64 bit store.
return 3;
- // Assume that all other non power-of-two numbers are scalarized.
+ // Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
VTy->getScalarType(),
@@ -692,3 +914,151 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned X86TTI::getIntImmCost(int64_t Val) const {
+ if (Val == 0)
+ return TCC_Free;
+
+ if (isInt<32>(Val))
+ return TCC_Basic;
+
+ return 2 * TCC_Basic;
+}
+
+unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ if (BitSize == 0)
+ return ~0U;
+
+ // Never hoist constants larger than 128bit, because this might lead to
+ // incorrect code generation or assertions in codegen.
+ // Fixme: Create a cost model for types larger than i128 once the codegen
+ // issues have been fixed.
+ if (BitSize > 128)
+ return TCC_Free;
+
+ if (Imm == 0)
+ return TCC_Free;
+
+ // Sign-extend all constants to a multiple of 64-bit.
+ APInt ImmVal = Imm;
+ if (BitSize & 0x3f)
+ ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+ // Split the constant into 64-bit chunks and calculate the cost for each
+ // chunk.
+ unsigned Cost = 0;
+ for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+ APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+ int64_t Val = Tmp.getSExtValue();
+ Cost += getIntImmCost(Val);
+ }
+ // We need at least one instruction to materialize the constant.
+ return std::max(1U, Cost);
+}
+
+unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TCC_Free;
+
+ unsigned ImmIdx = ~0U;
+ switch (Opcode) {
+ default: return TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TCC_Basic;
+ return TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
+ return TCC_Free;
+ break;
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ break;
+ }
+
+ if (Idx == ImmIdx) {
+ unsigned NumConstants = (BitSize + 63) / 64;
+ unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
+ return (Cost <= NumConstants * TCC_Basic)
+ ? static_cast<unsigned>(TCC_Free)
+ : Cost;
+ }
+
+ return X86TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) const {
+ assert(Ty->isIntegerTy());
+
+ unsigned BitSize = Ty->getPrimitiveSizeInBits();
+ // There is no cost model for constants with a bit size of 0. Return TCC_Free
+ // here, so that constant hoisting will ignore this constant.
+ if (BitSize == 0)
+ return TCC_Free;
+
+ switch (IID) {
+ default: return TCC_Free;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TCC_Free;
+ break;
+ case Intrinsic::experimental_stackmap:
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
+ }
+ return X86TTI::getIntImmCost(Imm, Ty);
+}
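As a sanity check on the chunked immediate costing above, a standalone sketch (not part of the patch; TCC_Free and TCC_Basic stand in for the TargetTransformInfo cost enumerators, assumed here to be 0 and 1) of the per-chunk helper together with a worked 128-bit case:

#include <cstdint>
#include <cstdio>

// Stand-ins for the TargetTransformInfo cost enumerators used above.
enum { TCC_Free = 0, TCC_Basic = 1 };

// Mirror of the per-chunk logic in X86TTI::getIntImmCost(int64_t): a zero
// chunk is free, a chunk that fits a sign-extended imm32 costs one
// instruction, and anything else needs a full 64-bit move (cost 2).
static unsigned chunkCost(int64_t Val) {
  if (Val == 0)
    return TCC_Free;
  if (Val >= INT32_MIN && Val <= INT32_MAX)
    return TCC_Basic;
  return 2 * TCC_Basic;
}

int main() {
  // Worked example: an i128 immediate whose low 64 bits are
  // 0x1122334455667788 and whose high 64 bits are zero. It is sign-extended
  // to a multiple of 64 bits and split into two 64-bit chunks.
  unsigned Cost = chunkCost(INT64_C(0x1122334455667788)) + chunkCost(0);
  if (Cost < 1)
    Cost = 1; // at least one instruction to materialize the constant
  // Prints 2: the low chunk does not fit an imm32, so it is costed at
  // 2 * TCC_Basic (a movabs-style 64-bit move); the high chunk is free.
  std::printf("cost = %u\n", Cost);
  return 0;
}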