Diffstat (limited to 'llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp')
-rw-r--r--    llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp    421
1 file changed, 227 insertions(+), 194 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index acec3c533585..864200e5f71c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -30,7 +30,8 @@ using namespace llvm;
//
//===----------------------------------------------------------------------===//
-int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -63,7 +64,8 @@ int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -177,11 +179,12 @@ int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
break;
}
- return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+ return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -226,7 +229,7 @@ int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+ return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
@@ -246,8 +249,7 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
for (auto &BB : L->blocks())
for (auto &I : *BB) {
if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
- ImmutableCallSite CS(&I);
- if (const Function *F = CS.getCalledFunction()) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (isLoweredToCall(F))
HasCall = true;
if (F->getIntrinsicID() == Intrinsic::memcpy ||
@@ -259,7 +261,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
if (isa<StoreInst>(&I)) {
Type *MemAccessTy = I.getOperand(0)->getType();
- NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0);
+ NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
+ TTI::TCK_RecipThroughput);
}
}
@@ -291,6 +294,10 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Force = true;
}
+void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) {
@@ -323,6 +330,23 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
return 0;
}
+unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const {
+ // Don't prefetch a loop with many far apart accesses.
+ if (NumPrefetches > 16)
+ return UINT_MAX;
+
+ // Emit prefetch instructions for smaller strides in cases where we think
+ // the hardware prefetcher might not be able to keep up.
+ if (NumStridedMemAccesses > 32 &&
+ NumStridedMemAccesses == NumMemAccesses && !HasCall)
+ return 1;
+
+ return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
+}
+
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
EVT VT = TLI->getValueType(DL, DataType);
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
@@ -341,18 +365,25 @@ static unsigned getScalarSizeInBits(Type *Ty) {
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
- assert(Ty->isVectorTy() && "Expected vector type");
- unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
+ auto *VTy = cast<FixedVectorType>(Ty);
+ unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
assert(WideBits > 0 && "Could not compute size of vector");
return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
int SystemZTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
// TODO: return a good value for BB-VECTORIZER that includes the
// immediate loads, which we do not want to count for the loop
// vectorizer, since they are hopefully hoisted out of the loop. This
@@ -391,10 +422,59 @@ int SystemZTTIImpl::getArithmeticInstrCost(
}
}
- if (Ty->isVectorTy()) {
- assert(ST->hasVector() &&
- "getArithmeticInstrCost() called with vector type.");
- unsigned VF = Ty->getVectorNumElements();
+ if (!Ty->isVectorTy()) {
+ // These FP operations are supported with a dedicated instruction for
+ // float, double and fp128 (base implementation assumes float generally
+ // costs 2).
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+ return 1;
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem)
+ return LIBCALL_COST;
+
+ // Give discount for some combined logical operations if supported.
+ if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+ if (Opcode == Instruction::Xor) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() &&
+ (I->getOpcode() == Instruction::And ||
+ I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::Xor))
+ return 0;
+ }
+ }
+ else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+ return 0;
+ }
+ }
+ }
+
+ // Or requires one instruction, although it has custom handling for i64.
+ if (Opcode == Instruction::Or)
+ return 1;
+
+ if (Opcode == Instruction::Xor && ScalarBits == 1) {
+ if (ST->hasLoadStoreOnCond2())
+ return 5; // 2 * (li 0; loc 1); xor
+ return 7; // 2 * ipm sequences ; xor ; shift ; compare
+ }
+
+ if (DivRemConstPow2)
+ return (SignedDivRem ? SDivPow2Cost : 1);
+ if (DivRemConst)
+ return DivMulSeqCost;
+ if (SignedDivRem || UnsignedDivRem)
+ return DivInstrCost;
+ }
+ else if (ST->hasVector()) {
+ auto *VTy = cast<FixedVectorType>(Ty);
+ unsigned VF = VTy->getNumElements();
unsigned NumVectors = getNumVectorRegs(Ty);
// These vector operations are custom handled, but are still supported
@@ -407,7 +487,7 @@ int SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConstPow2)
return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
if (DivRemConst)
- return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+ return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
// division/remainder, which will get scalarized and handled with
@@ -429,8 +509,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
unsigned ScalarCost =
- getArithmeticInstrCost(Opcode, Ty->getScalarType());
- unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+ getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
+ unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -447,101 +527,51 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
- unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+ unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
return Cost;
}
}
- else { // Scalar:
- // These FP operations are supported with a dedicated instruction for
- // float, double and fp128 (base implementation assumes float generally
- // costs 2).
- if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
- Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
- return 1;
-
- // There is no native support for FRem.
- if (Opcode == Instruction::FRem)
- return LIBCALL_COST;
-
- // Give discount for some combined logical operations if supported.
- if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
- if (Opcode == Instruction::Xor) {
- for (const Value *A : Args) {
- if (const Instruction *I = dyn_cast<Instruction>(A))
- if (I->hasOneUse() &&
- (I->getOpcode() == Instruction::And ||
- I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::Xor))
- return 0;
- }
- }
- else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
- for (const Value *A : Args) {
- if (const Instruction *I = dyn_cast<Instruction>(A))
- if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
- return 0;
- }
- }
- }
-
- // Or requires one instruction, although it has custom handling for i64.
- if (Opcode == Instruction::Or)
- return 1;
-
- if (Opcode == Instruction::Xor && ScalarBits == 1) {
- if (ST->hasLoadStoreOnCond2())
- return 5; // 2 * (li 0; loc 1); xor
- return 7; // 2 * ipm sequences ; xor ; shift ; compare
- }
-
- if (DivRemConstPow2)
- return (SignedDivRem ? SDivPow2Cost : 1);
- if (DivRemConst)
- return DivMulSeqCost;
- if (SignedDivRem || UnsignedDivRem)
- return DivInstrCost;
- }
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
-int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
- assert (Tp->isVectorTy());
- assert (ST->hasVector() && "getShuffleCost() called.");
- unsigned NumVectors = getNumVectorRegs(Tp);
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
+ if (ST->hasVector()) {
+ unsigned NumVectors = getNumVectorRegs(Tp);
- // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+ // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
- // FP128 values are always in scalar registers, so there is no work
- // involved with a shuffle, except for broadcast. In that case register
- // moves are done with a single instruction per element.
- if (Tp->getScalarType()->isFP128Ty())
- return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+ // FP128 values are always in scalar registers, so there is no work
+ // involved with a shuffle, except for broadcast. In that case register
+ // moves are done with a single instruction per element.
+ if (Tp->getScalarType()->isFP128Ty())
+ return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
- switch (Kind) {
- case TargetTransformInfo::SK_ExtractSubvector:
- // ExtractSubvector Index indicates start offset.
+ switch (Kind) {
+ case TargetTransformInfo::SK_ExtractSubvector:
+ // ExtractSubvector Index indicates start offset.
- // Extracting a subvector from first index is a noop.
- return (Index == 0 ? 0 : NumVectors);
+ // Extracting a subvector from first index is a noop.
+ return (Index == 0 ? 0 : NumVectors);
- case TargetTransformInfo::SK_Broadcast:
- // Loop vectorizer calls here to figure out the extra cost of
- // broadcasting a loaded value to all elements of a vector. Since vlrep
- // loads and replicates with a single instruction, adjust the returned
- // value.
- return NumVectors - 1;
+ case TargetTransformInfo::SK_Broadcast:
+ // Loop vectorizer calls here to figure out the extra cost of
+ // broadcasting a loaded value to all elements of a vector. Since vlrep
+ // loads and replicates with a single instruction, adjust the returned
+ // value.
+ return NumVectors - 1;
- default:
+ default:
- // SystemZ supports single instruction permutation / replication.
- return NumVectors;
+ // SystemZ supports single instruction permutation / replication.
+ return NumVectors;
+ }
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -564,8 +594,9 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
"Packing must reduce size of vector type.");
- assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
- "Packing should not change number of elements.");
+ assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
+ cast<FixedVectorType>(DstTy)->getNumElements() &&
+ "Packing should not change number of elements.");
// TODO: Since fp32 is expanded, the extract cost should always be 0.
@@ -580,7 +611,7 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
unsigned Cost = 0;
unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
- unsigned VF = SrcTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
for (unsigned P = 0; P < Log2Diff; ++P) {
if (NumParts > 1)
NumParts /= 2;
@@ -642,7 +673,7 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
// be either scalar or already vectorized with a same or lesser VF.
Type *ElTy = OpTy->getScalarType();
- return VectorType::get(ElTy, VF);
+ return FixedVectorType::get(ElTy, VF);
}
return nullptr;
@@ -653,8 +684,8 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
const Instruction *I) {
- assert (Dst->isVectorTy());
- unsigned VF = Dst->getVectorNumElements();
+ auto *DstVTy = cast<FixedVectorType>(Dst);
+ unsigned VF = DstVTy->getNumElements();
unsigned Cost = 0;
// If we know what the widths of the compared operands, get any cost of
// converting it to match Dst. Otherwise assume same widths.
@@ -668,14 +699,50 @@ getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
}
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // FIXME: Can the logic below also be used for these cost kinds?
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
+ int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ return BaseCost == 0 ? BaseCost : 1;
+ }
+
unsigned DstScalarBits = Dst->getScalarSizeInBits();
unsigned SrcScalarBits = Src->getScalarSizeInBits();
- if (Src->isVectorTy()) {
- assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
- assert (Dst->isVectorTy());
- unsigned VF = Src->getVectorNumElements();
+ if (!Src->isVectorTy()) {
+ assert (!Dst->isVectorTy());
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+ if (SrcScalarBits >= 32 ||
+ (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+ return 1;
+ return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+ }
+
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ Src->isIntegerTy(1)) {
+ if (ST->hasLoadStoreOnCond2())
+ return 2; // li 0; loc 1
+
+ // This should be extension of a compare i1 result, which is done with
+ // ipm and a varying sequence of instructions.
+ unsigned Cost = 0;
+ if (Opcode == Instruction::SExt)
+ Cost = (DstScalarBits < 64 ? 3 : 4);
+ if (Opcode == Instruction::ZExt)
+ Cost = 3;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+ if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+ // If operands of an fp-type was compared, this costs +1.
+ Cost++;
+ return Cost;
+ }
+ }
+ else if (ST->hasVector()) {
+ auto *SrcVecTy = cast<FixedVectorType>(Src);
+ auto *DstVecTy = cast<FixedVectorType>(Dst);
+ unsigned VF = SrcVecTy->getNumElements();
unsigned NumDstVectors = getNumVectorRegs(Dst);
unsigned NumSrcVectors = getNumVectorRegs(Src);
@@ -720,7 +787,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// inserting and extracting the values. Base implementation does not
// realize float->int gets scalarized.
unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
- Src->getScalarType());
+ Src->getScalarType(), CostKind);
unsigned TotCost = VF * ScalarCost;
bool NeedsInserts = true, NeedsExtracts = true;
// FP128 registers do not get inserted or extracted.
@@ -731,8 +798,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
- TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
+ TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
+ TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -743,7 +810,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
- return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+ return VF /*ldxbr/lexbr*/ +
+ getScalarizationOverhead(DstVecTy, true, false);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -756,40 +824,11 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(Src, false, true);
+ return VF + getScalarizationOverhead(SrcVecTy, false, true);
}
}
- else { // Scalar
- assert (!Dst->isVectorTy());
-
- if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
- if (SrcScalarBits >= 32 ||
- (I != nullptr && isa<LoadInst>(I->getOperand(0))))
- return 1;
- return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
- }
- if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
- Src->isIntegerTy(1)) {
- if (ST->hasLoadStoreOnCond2())
- return 2; // li 0; loc 1
-
- // This should be extension of a compare i1 result, which is done with
- // ipm and a varying sequence of instructions.
- unsigned Cost = 0;
- if (Opcode == Instruction::SExt)
- Cost = (DstScalarBits < 64 ? 3 : 4);
- if (Opcode == Instruction::ZExt)
- Cost = 3;
- Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
- if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
- // If operands of an fp-type was compared, this costs +1.
- Cost++;
- return Cost;
- }
- }
-
- return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
}
// Scalar i8 / i16 operations will typically be made after first extending
@@ -805,10 +844,38 @@ static unsigned getOperandsExtensionCost(const Instruction *I) {
}
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) {
- if (ValTy->isVectorTy()) {
- assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
- unsigned VF = ValTy->getVectorNumElements();
+ Type *CondTy,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
+
+ if (!ValTy->isVectorTy()) {
+ switch (Opcode) {
+ case Instruction::ICmp: {
+ // A loaded value compared with 0 with multiple users becomes Load and
+ // Test. The load is then not foldable, so return 0 cost for the ICmp.
+ unsigned ScalarBits = ValTy->getScalarSizeInBits();
+ if (I != nullptr && ScalarBits >= 32)
+ if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+ if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+ C->getZExtValue() == 0)
+ return 0;
+
+ unsigned Cost = 1;
+ if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+ Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
+ return Cost;
+ }
+ case Instruction::Select:
+ if (ValTy->isFloatingPointTy())
+ return 4; // No load on condition for FP - costs a conditional jump.
+ return 1; // Load On Condition / Select Register.
+ }
+ }
+ else if (ST->hasVector()) {
+ unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
// Called with a compare instruction.
if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
@@ -856,32 +923,8 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
}
}
- else { // Scalar
- switch (Opcode) {
- case Instruction::ICmp: {
- // A loaded value compared with 0 with multiple users becomes Load and
- // Test. The load is then not foldable, so return 0 cost for the ICmp.
- unsigned ScalarBits = ValTy->getScalarSizeInBits();
- if (I != nullptr && ScalarBits >= 32)
- if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
- if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
- if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
- C->getZExtValue() == 0)
- return 0;
-
- unsigned Cost = 1;
- if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
- Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
- return Cost;
- }
- case Instruction::Select:
- if (ValTy->isFloatingPointTy())
- return 4; // No load on condition for FP - costs a conditional jump.
- return 1; // Load On Condition / Select Register.
- }
- }
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
}
int SystemZTTIImpl::
@@ -995,9 +1038,14 @@ static bool isBswapIntrinsicCall(const Value *V) {
int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(!Src->isVoidTy() && "Invalid type");
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
// Store the load or its truncated or extended value in FoldedValue.
const Instruction *FoldedValue = nullptr;
@@ -1058,16 +1106,13 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
-int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int SystemZTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
@@ -1075,7 +1120,7 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// Return the ceiling of dividing A by B.
auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
- unsigned NumElts = VecTy->getVectorNumElements();
+ unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
unsigned VF = NumElts / Factor;
unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
@@ -1125,22 +1170,10 @@ static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
return -1;
}
-int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF, unsigned VF) {
- int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
- if (Cost != -1)
- return Cost;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-}
-
-int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+int SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ int Cost = getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
if (Cost != -1)
return Cost;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
- FMF, ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
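
Note: the common thread of this patch is that every SystemZ cost hook now takes a TTI::TargetCostKind and forwards it to the base implementation. A minimal sketch of what a cost query looks like from the caller's side, assuming a TargetTransformInfo analysis result is already in hand (the helper name and setup below are illustrative and not part of this commit; signatures match the tree at the time of this change, where costs are plain int):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Illustrative helper (not from the patch): query the reciprocal-throughput
// cost of a multiply on type Ty, the cost kind the SystemZ hooks model in
// most detail; other kinds mostly fall back to the base implementation.
static int mulThroughputCost(const TargetTransformInfo &TTI, Type *Ty) {
  return TTI.getArithmeticInstrCost(Instruction::Mul, Ty,
                                    TargetTransformInfo::TCK_RecipThroughput);
}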