Diffstat (limited to 'contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h')
-rw-r--r--  contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h  264
 1 file changed, 154 insertions(+), 110 deletions(-)
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index e3b834ec42c3..324b7dcfb3ac 100644
--- a/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -282,6 +283,11 @@ public:
return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
}
+ std::pair<const Value *, unsigned>
+ getPredicatedAddrSpace(const Value *V) const {
+ return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
+ }
+
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;
@@ -363,8 +369,9 @@ public:
}
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
- ArrayRef<const Value *> Operands) {
- return BaseT::getGEPCost(PointeeType, Ptr, Operands);
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
+ return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
}
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
@@ -484,7 +491,8 @@ public:
int getInlinerVectorBonusPercent() { return 150; }
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
- TTI::UnrollingPreferences &UP) {
+ TTI::UnrollingPreferences &UP,
+ OptimizationRemarkEmitter *ORE) {
// This unrolling functionality is target independent, but to provide some
// motivation for its intended use, for x86:
@@ -526,6 +534,15 @@ public:
continue;
}
+ if (ORE) {
+ ORE->emit([&]() {
+ return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
+ L->getHeader())
+ << "advising against unrolling the loop because it "
+ "contains a "
+ << ore::NV("Call", &I);
+ });
+ }
return;
}
}
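Assuming remarks are enabled in the usual way (e.g. -Rpass=TTI with clang, or --pass-remarks=TTI with opt, plus debug locations in the IR), the new diagnostic would surface roughly as "remark: ...: advising against unrolling the loop because it contains a call to <callee>".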
@@ -653,6 +670,7 @@ public:
}
Optional<unsigned> getMaxVScale() const { return None; }
+ Optional<unsigned> getVScaleForTuning() const { return None; }
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the demanded result elements need to be inserted and/or
@@ -686,7 +704,7 @@ public:
bool Extract) {
auto *Ty = cast<FixedVectorType>(InTy);
- APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
+ APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
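For orientation, a toy standalone model of this helper under a flat per-lane cost (plain integers in place of llvm::APInt and the real per-target hooks; the popcount of the all-ones demanded mask is simply the element count):

    #include <cstdint>

    // With every lane of an N-element vector demanded, scalarization costs
    // N inserts (to build the vector) and/or N extracts (to read it back),
    // assuming one unit per lane; real targets override the per-lane cost.
    uint64_t scalarizationOverheadSketch(unsigned NumElts, bool Insert,
                                         bool Extract) {
      return (Insert ? NumElts : 0u) + (Extract ? NumElts : 0u);
    }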
@@ -737,8 +755,7 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
InstructionCost getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -1102,6 +1119,39 @@ public:
return LT.first;
}
+ InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+ int VF,
+ const APInt &DemandedDstElts,
+ TTI::TargetCostKind CostKind) {
+ assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
+ "Unexpected size of DemandedDstElts.");
+
+ InstructionCost Cost;
+
+ auto *SrcVT = FixedVectorType::get(EltTy, VF);
+ auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
+
+ // The Mask shuffling cost is to extract all the elements of the Mask
+ // and insert each of them Factor times into the wide vector:
+ //
+ // E.g. an interleaved group with factor 3:
+ // %mask = icmp ult <8 x i32> %vec1, %vec2
+ // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+ // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+ // The cost is estimated as extracting all mask elements from the <8xi1>
+ // mask vector and inserting them Factor times into the <24xi1> shuffled
+ // mask vector.
+ APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
+ Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
+ /*Insert*/ false,
+ /*Extract*/ true);
+ Cost +=
+ thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
+ /*Insert*/ true, /*Extract*/ false);
+
+ return Cost;
+ }
+
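A standalone sketch of the mask scaling APIntOps::ScaleBitMask performs here, with uint64_t standing in for llvm::APInt (illustrative, not the LLVM API):

    #include <cstdint>

    // Collapse a (VF * Factor)-bit destination mask to a VF-bit source mask:
    // source lane I is demanded iff any of its Factor replicas in the block
    // [I*Factor, I*Factor + Factor) is demanded.
    uint64_t scaleBitMaskDown(uint64_t DemandedDstElts, unsigned VF,
                              unsigned Factor) {
      uint64_t DemandedSrcElts = 0;
      for (unsigned I = 0; I < VF; ++I)
        if ((DemandedDstElts >> (I * Factor)) & ((1ull << Factor) - 1))
          DemandedSrcElts |= 1ull << I;
      return DemandedSrcElts;
    }

With VF = 8, ReplicationFactor = 3, and all destination lanes demanded, the 24 destination bits collapse to 8 source bits: 8 extracts from the <8 x i1> mask plus 24 inserts into the <24 x i1> result, matching the example in the comment.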
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
@@ -1201,9 +1251,9 @@ public:
// used (those corresponding to elements [0:1] and [8:9] of the unlegalized
// type). The other loads are unused.
//
- // We only scale the cost of loads since interleaved store groups aren't
- // allowed to have gaps.
- if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
+ // TODO: Note that legalization can turn masked loads/stores into unmasked
+ // (legalized) loads/stores. This can be reflected in the cost.
+ if (Cost.isValid() && VecTySize > VecTyLTSize) {
// The number of loads of a legal type it will take to represent a load
// of the unlegalized vector type.
unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
@@ -1220,10 +1270,24 @@ public:
// Scale the cost of the load by the fraction of legal instructions that
// will be used.
- Cost *= UsedInsts.count() / NumLegalInsts;
+ Cost = divideCeil(UsedInsts.count() * Cost.getValue().getValue(),
+ NumLegalInsts);
}
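The divideCeil form also fixes a silent truncation in the line it replaces; with assumed numbers:

    // Assumed example: the unlegalized load splits into NumLegalInsts = 4
    // legal loads, of which only UsedInsts.count() = 2 feed demanded lanes,
    // and the legalized cost so far is 8.
    //   Old: Cost *= UsedInsts.count() / NumLegalInsts; // 2 / 4 == 0 in
    //                                                   // integer math
    //   New: Cost = divideCeil(2 * 8, 4);               // == 4, rounded up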
// Then add the cost of the interleave operation.
+ assert(Indices.size() <= Factor &&
+ "Interleaved memory op has too many members");
+
+ const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
+ const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
+
+ APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
+ for (unsigned Index : Indices) {
+ assert(Index < Factor && "Invalid index for interleaved memory op");
+ for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
+ DemandedLoadStoreElts.setBit(Index + Elm * Factor);
+ }
+
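A standalone illustration of the demanded-lane mask built above, for the factor-3 group with members at indices {0, 1} and NumSubElts = 4 that the store example below also uses (std::bitset standing in for llvm::APInt):

    #include <bitset>
    #include <iostream>

    int main() {
      const unsigned Factor = 3, NumSubElts = 4;
      std::bitset<12> Demanded; // NumElts = Factor * NumSubElts = 12 lanes
      for (unsigned Index : {0u, 1u})
        for (unsigned Elm = 0; Elm < NumSubElts; ++Elm)
          Demanded.set(Index + Elm * Factor);
      // Lanes 0,1,3,4,6,7,9,10 are demanded; 2,5,8,11 are the index-2 gaps.
      std::cout << Demanded << '\n'; // prints 011011011011
    }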
if (Opcode == Instruction::Load) {
// The interleave cost is similar to extracting the sub vectors' elements
// from the wide vector and inserting them into the sub vectors.
@@ -1233,79 +1297,56 @@ public:
// %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
// The cost is estimated as extract elements at 0, 2, 4, 6 from the
// <8 x i32> vector and insert them into a <4 x i32> vector.
-
- assert(Indices.size() <= Factor &&
- "Interleaved memory op has too many members");
-
- for (unsigned Index : Indices) {
- assert(Index < Factor && "Invalid index for interleaved memory op");
-
- // Extract elements from loaded vector for each sub vector.
- for (unsigned i = 0; i < NumSubElts; i++)
- Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
- Index + i * Factor);
- }
-
- InstructionCost InsSubCost = 0;
- for (unsigned i = 0; i < NumSubElts; i++)
- InsSubCost +=
- thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
-
+ InstructionCost InsSubCost =
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ true, /*Extract*/ false);
Cost += Indices.size() * InsSubCost;
+ Cost +=
+ thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+ /*Insert*/ false, /*Extract*/ true);
} else {
- // The interleave cost is extract all elements from sub vectors, and
+ // The interleave cost is to extract elements from the sub vectors, and
// insert them into the wide vector.
//
- // E.g. An interleaved store of factor 2:
- // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
- // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
- // The cost is estimated as extract all elements from both <4 x i32>
- // vectors and insert into the <8 x i32> vector.
-
- InstructionCost ExtSubCost = 0;
- for (unsigned i = 0; i < NumSubElts; i++)
- ExtSubCost +=
- thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
- Cost += ExtSubCost * Factor;
-
- for (unsigned i = 0; i < NumElts; i++)
- Cost += static_cast<T *>(this)
- ->getVectorInstrCost(Instruction::InsertElement, VT, i);
+ // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
+ // (using VF=4):
+ // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
+ // %gaps.mask = <true, true, false, true, true, false,
+ // true, true, false, true, true, false>
+ // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
+ // i32 Align, <12 x i1> %gaps.mask
+ // The cost is estimated as extracting all elements (of the actual
+ // members, excluding gaps) from both <4 x i32> vectors and inserting
+ // them into the <12 x i32> vector.
+ InstructionCost ExtSubCost =
+ thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
+ /*Insert*/ false, /*Extract*/ true);
+ Cost += ExtSubCost * Indices.size();
+ Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+ /*Insert*/ true,
+ /*Extract*/ false);
}
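Under a flat unit cost per lane, both branches tally the same for these examples: the factor-2 load (VF = 4) costs 4 inserts per <4 x i32> sub vector times 2 indices plus 8 extracts of the demanded wide-vector lanes, and the factor-3, two-member store costs 4 extracts per member times 2 members plus 8 inserts into the demanded lanes of the <12 x i32> vector, i.e. 16 units each before the memory operation and any mask costs.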
if (!UseMaskForCond)
return Cost;
Type *I8Type = Type::getInt8Ty(VT->getContext());
- auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
- SubVT = FixedVectorType::get(I8Type, NumSubElts);
-
- // The Mask shuffling cost is extract all the elements of the Mask
- // and insert each of them Factor times into the wide vector:
- //
- // E.g. an interleaved group with factor 3:
- // %mask = icmp ult <8 x i32> %vec1, %vec2
- // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
- // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
- // The cost is estimated as extract all mask elements from the <8xi1> mask
- // vector and insert them factor times into the <24xi1> shuffled mask
- // vector.
- for (unsigned i = 0; i < NumSubElts; i++)
- Cost +=
- thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
- for (unsigned i = 0; i < NumElts; i++)
- Cost +=
- thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
+ Cost += thisT()->getReplicationShuffleCost(
+ I8Type, Factor, NumSubElts,
+ UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
+ CostKind);
// The Gaps mask is invariant and created outside the loop, therefore the
// cost of creating it is not accounted for here. However if we have both
// a MaskForGaps and some other mask that guards the execution of the
// memory access, we need to account for the cost of And-ing the two masks
// inside the loop.
- if (UseMaskForGaps)
+ if (UseMaskForGaps) {
+ auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
CostKind);
+ }
return Cost;
}
@@ -1460,10 +1501,10 @@ public:
Type *CondTy = RetTy->getWithNewBitWidth(1);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::ICMP_EQ, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::ICMP_EQ, CostKind);
}
return Cost;
}
@@ -1689,26 +1730,34 @@ public:
return thisT()->getMinMaxReductionCost(
VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
/*IsUnsigned=*/true, CostKind);
- case Intrinsic::abs:
+ case Intrinsic::abs: {
+ // abs(X) = select(icmp(X,0),X,sub(0,X))
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
+ InstructionCost Cost = 0;
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ Pred, CostKind);
+ // TODO: Should we add an OperandValueProperties::OP_Zero property?
+ Cost += thisT()->getArithmeticInstrCost(
+ BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
+ return Cost;
+ }
case Intrinsic::smax:
case Intrinsic::smin:
case Intrinsic::umax:
case Intrinsic::umin: {
- // abs(X) = select(icmp(X,0),X,sub(0,X))
// minmax(X,Y) = select(icmp(X,Y),X,Y)
Type *CondTy = RetTy->getWithNewBitWidth(1);
+ bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
+ CmpInst::Predicate Pred =
+ IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
InstructionCost Cost = 0;
- // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- // TODO: Should we add an OperandValueProperties::OP_Zero property?
- if (IID == Intrinsic::abs)
- Cost += thisT()->getArithmeticInstrCost(
- BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ Pred, CostKind);
return Cost;
}
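The two expansions being costed, rendered as scalar C++ for illustration (the cost model itself targets the vector IR forms; INT_MIN remains the usual poison corner for abs):

    #include <cstdint>

    // abs(X) = select(icmp_sgt(X, 0), X, sub(0, X)): one compare, one
    // select, and one subtract-from-zero, i.e. the three costs summed above.
    int32_t absExpansion(int32_t X) { return X > 0 ? X : 0 - X; }

    // minmax(X, Y) = select(icmp(X, Y), X, Y): one compare plus one select;
    // smax/smin use ICMP_SGT and umax/umin ICMP_UGT, as the predicates
    // above encode.
    int32_t smaxExpansion(int32_t X, int32_t Y) { return X > Y ? X : Y; }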
case Intrinsic::sadd_sat:
@@ -1719,6 +1768,7 @@ public:
Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
? Intrinsic::sadd_with_overflow
: Intrinsic::ssub_with_overflow;
+ CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
// SatMax -> Overflow && SumDiff < 0
// SatMin -> Overflow && SumDiff >= 0
@@ -1726,12 +1776,10 @@ public:
IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
nullptr, ScalarizationCostPassed);
Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost += 2 * thisT()->getCmpSelInstrCost(
- BinaryOperator::Select, RetTy, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+ Pred, CostKind);
+ Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
+ CondTy, Pred, CostKind);
return Cost;
}
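A scalar sketch of the saturating-add expansion whose pieces are costed above, with the GCC/Clang builtin __builtin_add_overflow standing in for llvm.sadd.with.overflow:

    #include <cstdint>
    #include <limits>

    int32_t saddSatSketch(int32_t X, int32_t Y) {
      int32_t Sum;
      bool Ov = __builtin_add_overflow(X, Y, &Sum); // sadd.with.overflow
      // SatMax -> Overflow && Sum < 0; SatMin -> Overflow && Sum >= 0.
      int32_t Sat = Sum < 0 ? std::numeric_limits<int32_t>::max()
                            : std::numeric_limits<int32_t>::min(); // select #1
      return Ov ? Sat : Sum;                                       // select #2
    }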
case Intrinsic::uadd_sat:
@@ -1784,23 +1832,16 @@ public:
? BinaryOperator::Add
: BinaryOperator::Sub;
- // LHSSign -> LHS >= 0
- // RHSSign -> RHS >= 0
- // SumSign -> Sum >= 0
- //
// Add:
- // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Overflow -> (Result < LHS) ^ (RHS < 0)
// Sub:
- // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ // Overflow -> (Result < LHS) ^ (RHS > 0)
InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
- Cost += 3 * thisT()->getCmpSelInstrCost(
- Instruction::ICmp, SumTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
Cost += 2 * thisT()->getCmpSelInstrCost(
- Instruction::Select, OverflowTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
- Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
+ Instruction::ICmp, SumTy, OverflowTy,
+ CmpInst::ICMP_SGT, CostKind);
+ Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
CostKind);
return Cost;
}
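The new recipe written out for scalar signed add (the wrapping add is done in unsigned arithmetic to keep the sketch well defined); the ssub form only flips the second test to RHS > 0:

    #include <cstdint>

    // Overflow -> (Result < LHS) ^ (RHS < 0): two compares and one xor,
    // replacing the old three-compare, two-select, one-and sequence.
    bool saddOverflowSketch(int32_t LHS, int32_t RHS) {
      int32_t Result = (int32_t)((uint32_t)LHS + (uint32_t)RHS);
      return (Result < LHS) ^ (RHS < 0);
    }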
@@ -1811,12 +1852,15 @@ public:
unsigned Opcode = IID == Intrinsic::uadd_with_overflow
? BinaryOperator::Add
: BinaryOperator::Sub;
+ CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
+ ? CmpInst::ICMP_ULT
+ : CmpInst::ICMP_UGT;
InstructionCost Cost = 0;
Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
Cost +=
thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Pred, CostKind);
return Cost;
}
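Scalar sketches of the unsigned checks the two predicates encode:

    #include <cstdint>

    // uadd.with.overflow: the wrapped sum is below an operand exactly when
    // the add carried out (the ICMP_ULT case above).
    bool uaddOverflowSketch(uint32_t LHS, uint32_t RHS) {
      return LHS + RHS < LHS;
    }

    // usub.with.overflow: the wrapped difference exceeds the minuend exactly
    // when the subtract borrowed (the ICMP_UGT case above).
    bool usubOverflowSketch(uint32_t LHS, uint32_t RHS) {
      return LHS - RHS > LHS;
    }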
case Intrinsic::smul_with_overflow:
@@ -1825,9 +1869,9 @@ public:
Type *OverflowTy = RetTy->getContainedType(1);
unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
+ bool IsSigned = IID == Intrinsic::smul_with_overflow;
- unsigned ExtOp =
- IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
+ unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
TTI::CastContextHint CCH = TTI::CastContextHint::None;
InstructionCost Cost = 0;
@@ -1836,18 +1880,17 @@ public:
thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
CCH, CostKind);
- Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
+ Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
CostKind, TTI::OK_AnyValue,
TTI::OK_UniformConstantValue);
- if (IID == Intrinsic::smul_with_overflow)
+ if (IsSigned)
Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
CostKind, TTI::OK_AnyValue,
TTI::OK_UniformConstantValue);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
return Cost;
}
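The widen/multiply/compare recipe above as a scalar sketch for 32-bit operands (so ExtSize = 64; the shift counts are the assumed concrete values):

    #include <cstdint>

    // SExt both operands, multiply in the wide type, truncate back, then
    // check that the high half matches the sign-extension of the low half.
    bool smulOverflowSketch(int32_t LHS, int32_t RHS, int32_t &Product) {
      int64_t Wide = (int64_t)LHS * (int64_t)RHS; // SExt + Mul in ExtTy
      Product = (int32_t)Wide;                    // Trunc
      int32_t Hi = (int32_t)(Wide >> 32);         // LShr on ExtTy + Trunc
      return Hi != (Product >> 31);               // AShr (signed) + ICmp ne
    }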
case Intrinsic::ctpop:
@@ -1974,16 +2017,16 @@ public:
/// \param RetTy Return value types.
/// \param Tys Argument types.
/// \returns The cost of Call instruction.
- InstructionCost
- getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
- TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
+ InstructionCost getCallInstrCost(Function *F, Type *RetTy,
+ ArrayRef<Type *> Tys,
+ TTI::TargetCostKind CostKind) {
return 10;
}
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT =
getTLI()->getTypeLegalizationCost(DL, Tp);
- return *LT.first.getValue();
+ return LT.first.isValid() ? *LT.first.getValue() : 0;
}
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
@@ -2060,7 +2103,8 @@ public:
// By default reductions need one shuffle per reduction level.
ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
- ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
+ ArithCost +=
+ NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
return ShuffleCost + ArithCost +
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
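A scalar sketch of the tree shape this models, assuming N is a power of two: log2(N) levels, each one shuffle plus one arithmetic op in the tally above, then a single lane-0 extract:

    // In-place pairwise reduction: every level halves the live lanes, which
    // corresponds to one SK_PermuteSingleSrc shuffle plus one arithmetic op
    // per level in the cost model.
    int treeReduceAdd(int V[], unsigned N) {
      for (unsigned Stride = N / 2; Stride > 0; Stride /= 2)
        for (unsigned I = 0; I < Stride; ++I)
          V[I] += V[I + Stride];
      return V[0]; // the final ExtractElement at index 0
    }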