Diffstat (limited to 'lib/Target/SystemZ/SystemZTargetTransformInfo.cpp')
-rw-r--r--  lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 489
1 file changed, 353 insertions(+), 136 deletions(-)
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index c5cdc22f2099c..129610fe095b0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -328,6 +328,25 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
+// Return the bit size for the scalar type or vector element
+// type. getScalarSizeInBits() returns 0 for a pointer type.
+static unsigned getScalarSizeInBits(Type *Ty) {
+ unsigned Size =
+ (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
+ assert(Size > 0 && "Element must have non-zero size.");
+ return Size;
+}
+
+// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
+// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
+// 3.
+static unsigned getNumVectorRegs(Type *Ty) {
+ assert(Ty->isVectorTy() && "Expected vector type");
+ unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
+ assert(WideBits > 0 && "Could not compute size of vector");
+ return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+}
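Not part of the patch: a minimal standalone sketch of the arithmetic in getNumVectorRegs(), assuming 128-bit vector registers; the helper name and the main() driver below are hypothetical.

#include <cassert>
#include <iostream>

// Hypothetical mirror of getNumVectorRegs(): ceil(ElemBits * NumElems / 128).
static unsigned numVectorRegsFor(unsigned ElemBits, unsigned NumElems) {
  unsigned WideBits = ElemBits * NumElems;
  assert(WideBits > 0 && "Could not compute size of vector");
  return (WideBits + 127U) / 128U;
}

int main() {
  // <6 x i64> spans 384 bits and fits in 3 vector registers; legalization
  // based splitting (getNumberOfParts) would instead report 4.
  std::cout << numVectorRegsFor(64, 6) << '\n'; // 3
  std::cout << numVectorRegsFor(32, 3) << '\n'; // 1 (96 bits)
  return 0;
}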
+
int SystemZTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
@@ -343,44 +362,59 @@ int SystemZTTIImpl::getArithmeticInstrCost(
unsigned ScalarBits = Ty->getScalarSizeInBits();
- // Div with a constant which is a power of 2 will be converted by
- // DAGCombiner to use shifts. With vector shift-element instructions, a
- // vector sdiv costs about as much as a scalar one.
- const unsigned SDivCostEstimate = 4;
- bool SDivPow2 = false;
- bool UDivPow2 = false;
- if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
- Args.size() == 2) {
- const ConstantInt *CI = nullptr;
+ // There are three cases of division and remainder: Dividing with a register
+ // needs a divide instruction. A divisor that is a constant power of two can
+ // be implemented with a sequence of shifts. Any other constant needs a
+ // multiply and shifts.
+ const unsigned DivInstrCost = 20;
+ const unsigned DivMulSeqCost = 10;
+ const unsigned SDivPow2Cost = 4;
+
+ bool SignedDivRem =
+ Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+ bool UnsignedDivRem =
+ Opcode == Instruction::UDiv || Opcode == Instruction::URem;
+
+ // Check for a constant divisor.
+ bool DivRemConst = false;
+ bool DivRemConstPow2 = false;
+ if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
if (const Constant *C = dyn_cast<Constant>(Args[1])) {
- if (C->getType()->isVectorTy())
- CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+ const ConstantInt *CVal =
+ (C->getType()->isVectorTy()
+ ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
+ : dyn_cast<const ConstantInt>(C));
+ if (CVal != nullptr &&
+ (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+ DivRemConstPow2 = true;
else
- CI = dyn_cast<const ConstantInt>(C);
- }
- if (CI != nullptr &&
- (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
- if (Opcode == Instruction::SDiv)
- SDivPow2 = true;
- else
- UDivPow2 = true;
+ DivRemConst = true;
}
}
if (Ty->isVectorTy()) {
- assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+ assert(ST->hasVector() &&
+ "getArithmeticInstrCost() called with vector type.");
unsigned VF = Ty->getVectorNumElements();
- unsigned NumVectors = getNumberOfParts(Ty);
+ unsigned NumVectors = getNumVectorRegs(Ty);
// These vector operations are custom handled, but are still supported
// with one instruction per vector, regardless of element size.
if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
- Opcode == Instruction::AShr || UDivPow2) {
+ Opcode == Instruction::AShr) {
return NumVectors;
}
- if (SDivPow2)
- return (NumVectors * SDivCostEstimate);
+ if (DivRemConstPow2)
+ return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
+ if (DivRemConst)
+ return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+ if ((SignedDivRem || UnsignedDivRem) && VF > 4)
+ // Temporary hack: disable high vectorization factors with integer
+ // division/remainder, which will get scalarized and handled with
+ // GR128 registers. The mischeduler is not clever enough to avoid
+ // spilling yet.
+ return 1000;
// These FP operations are supported with a single vector instruction for
// double (base implementation assumes float generally costs 2). For
@@ -395,7 +429,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
return NumVectors;
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ unsigned ScalarCost =
+ getArithmeticInstrCost(Opcode, Ty->getScalarType());
unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
@@ -432,30 +467,22 @@ int SystemZTTIImpl::getArithmeticInstrCost(
if (Opcode == Instruction::FRem)
return LIBCALL_COST;
- if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
- return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
-
// Or requires one instruction, although it has custom handling for i64.
if (Opcode == Instruction::Or)
return 1;
- if (Opcode == Instruction::Xor && ScalarBits == 1)
- // 2 * ipm sequences ; xor ; shift ; compare
- return 7;
-
- if (UDivPow2)
- return 1;
- if (SDivPow2)
- return SDivCostEstimate;
-
- // An extra extension for narrow types is needed.
- if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
- // sext of op(s) for narrow types
- return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+ if (Opcode == Instruction::Xor && ScalarBits == 1) {
+ if (ST->hasLoadStoreOnCond2())
+ return 5; // 2 * (li 0; loc 1); xor
+ return 7; // 2 * ipm sequences ; xor ; shift ; compare
+ }
- if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
- // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
- return (ScalarBits < 32 ? 4 : 2);
+ if (DivRemConstPow2)
+ return (SignedDivRem ? SDivPow2Cost : 1);
+ if (DivRemConst)
+ return DivMulSeqCost;
+ if (SignedDivRem || UnsignedDivRem)
+ return DivInstrCost;
}
// Fallback to the default implementation.
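For reference, a standalone sketch (not part of the patch; the function name and driver are hypothetical) of how the scalar division/remainder costs above are selected from the three divisor cases:

#include <cstdint>
#include <iostream>

static unsigned scalarDivRemCost(bool Signed, bool ConstDivisor,
                                 int64_t Divisor) {
  const unsigned DivInstrCost = 20, DivMulSeqCost = 10, SDivPow2Cost = 4;
  auto isPow2 = [](uint64_t V) { return V != 0 && (V & (V - 1)) == 0; };
  uint64_t U = static_cast<uint64_t>(Divisor);
  if (ConstDivisor && (isPow2(U) || isPow2(0U - U)))
    return Signed ? SDivPow2Cost : 1;   // becomes a shift sequence
  if (ConstDivisor)
    return DivMulSeqCost;               // multiply + shifts
  return DivInstrCost;                  // full divide instruction
}

int main() {
  std::cout << scalarDivRemCost(true, true, 16) << '\n';   // 4
  std::cout << scalarDivRemCost(true, true, -16) << '\n';  // 4
  std::cout << scalarDivRemCost(false, true, 7) << '\n';   // 10
  std::cout << scalarDivRemCost(true, false, 0) << '\n';   // 20
}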
@@ -463,12 +490,11 @@ int SystemZTTIImpl::getArithmeticInstrCost(
Opd1PropInfo, Opd2PropInfo, Args);
}
-
int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
assert (Tp->isVectorTy());
assert (ST->hasVector() && "getShuffleCost() called.");
- unsigned NumVectors = getNumberOfParts(Tp);
+ unsigned NumVectors = getNumVectorRegs(Tp);
// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
@@ -523,7 +549,7 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
// TODO: Since fp32 is expanded, the extract cost should always be 0.
- unsigned NumParts = getNumberOfParts(SrcTy);
+ unsigned NumParts = getNumVectorRegs(SrcTy);
if (NumParts <= 2)
// Up to 2 vector registers can be truncated efficiently with pack or
// permute. The latter requires an immediate mask to be loaded, which
@@ -566,7 +592,7 @@ getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
// The bitmask will be truncated.
PackCost = getVectorTruncCost(SrcTy, DstTy);
else if (SrcScalarBits < DstScalarBits) {
- unsigned DstNumParts = getNumberOfParts(DstTy);
+ unsigned DstNumParts = getNumVectorRegs(DstTy);
// Each vector select needs its part of the bitmask unpacked.
PackCost = Log2Diff * DstNumParts;
// Extra cost for moving part of mask before unpacking.
@@ -602,6 +628,25 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
return nullptr;
}
+// Get the cost of converting a boolean vector to a vector with same width
+// and element size as Dst, plus the cost of zero extending if needed.
+unsigned SystemZTTIImpl::
+getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+ const Instruction *I) {
+ assert (Dst->isVectorTy());
+ unsigned VF = Dst->getVectorNumElements();
+ unsigned Cost = 0;
+ // If we know the widths of the compared operands, get any cost of
+ // converting them to match Dst. Otherwise assume the same widths.
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+ if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
+ // One 'vn' per dst vector with an immediate mask.
+ Cost += getNumVectorRegs(Dst);
+ return Cost;
+}
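Not part of the patch, just a restatement of the formula implemented above (the free function below is hypothetical): the cost is the bitmask width conversion, plus one 'vn' with an immediate mask per destination vector register when zero extension is needed.

// Hypothetical mirror of the cost computed by getBoolVecToIntConversionCost().
static unsigned boolVecToIntCost(unsigned BitmaskConvCost,
                                 unsigned NumDstVecRegs, bool ZeroExtends) {
  unsigned Cost = BitmaskConvCost;  // widen/narrow the i1 mask to match Dst
  if (ZeroExtends)
    Cost += NumDstVecRegs;          // one 'vn' per destination vector register
  return Cost;
}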
+
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I) {
unsigned DstScalarBits = Dst->getScalarSizeInBits();
@@ -611,8 +656,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
assert (Dst->isVectorTy());
unsigned VF = Src->getVectorNumElements();
- unsigned NumDstVectors = getNumberOfParts(Dst);
- unsigned NumSrcVectors = getNumberOfParts(Src);
+ unsigned NumDstVectors = getNumVectorRegs(Dst);
+ unsigned NumSrcVectors = getNumVectorRegs(Src);
if (Opcode == Instruction::Trunc) {
if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
@@ -633,19 +678,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
}
- else if (SrcScalarBits == 1) {
- // This should be extension of a compare i1 result.
- // If we know what the widths of the compared operands, get the
- // cost of converting it to Dst. Otherwise assume same widths.
- unsigned Cost = 0;
- Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
- if (CmpOpTy != nullptr)
- Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
- if (Opcode == Instruction::ZExt)
- // One 'vn' per dst vector with an immediate mask.
- Cost += NumDstVectors;
- return Cost;
- }
+ else if (SrcScalarBits == 1)
+ return getBoolVecToIntConversionCost(Opcode, Dst, I);
}
if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
@@ -654,8 +688,13 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// (seems to miss on differentiating on scalar/vector types).
// Only 64 bit vector conversions are natively supported.
- if (SrcScalarBits == 64 && DstScalarBits == 64)
- return NumDstVectors;
+ if (DstScalarBits == 64) {
+ if (SrcScalarBits == 64)
+ return NumDstVectors;
+
+ if (SrcScalarBits == 1)
+ return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
+ }
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values. Base implementation does not
@@ -672,7 +711,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+ TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
+ TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -702,11 +742,18 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
else { // Scalar
assert (!Dst->isVectorTy());
- if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
- return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+ if (SrcScalarBits >= 32 ||
+ (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+ return 1;
+ return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+ }
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
Src->isIntegerTy(1)) {
+ if (ST->hasLoadStoreOnCond2())
+ return 2; // li 0; loc 1
+
// This should be extension of a compare i1 result, which is done with
// ipm and a varying sequence of instructions.
unsigned Cost = 0;
@@ -718,7 +765,6 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
// If operands of an fp-type was compared, this costs +1.
Cost++;
-
return Cost;
}
}
@@ -726,8 +772,20 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}
-int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- const Instruction *I) {
+// Scalar i8 / i16 operations are typically performed after first extending
+// the operands to i32.
+static unsigned getOperandsExtensionCost(const Instruction *I) {
+ unsigned ExtCost = 0;
+ for (Value *Op : I->operands())
+ // A load of i8 or i16 sign/zero extends to i32.
+ if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
+ ExtCost++;
+
+ return ExtCost;
+}
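A toy standalone illustration (not part of the patch; the MockOperand type is invented for the example) of the counting rule above: loads extend for free and immediates need no extension, so only plain register operands of an i8/i16 compare each pay one extension.

#include <iostream>

struct MockOperand { bool IsLoad; bool IsConstInt; };

static unsigned operandsExtensionCost(const MockOperand (&Ops)[2]) {
  unsigned ExtCost = 0;
  for (const MockOperand &Op : Ops)
    if (!Op.IsLoad && !Op.IsConstInt)
      ++ExtCost;
  return ExtCost;
}

int main() {
  MockOperand RegVsImm[2] = {{false, false}, {false, true}};
  std::cout << operandsExtensionCost(RegVsImm) << '\n'; // 1: only the register
}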
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy, const Instruction *I) {
if (ValTy->isVectorTy()) {
assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
unsigned VF = ValTy->getVectorNumElements();
@@ -759,7 +817,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
// Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
// floats. FIXME: <2 x float> generates same code as <4 x float>.
unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
- unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+ unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
return Cost;
@@ -775,20 +833,30 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
PackCost =
getVectorBitmaskConversionCost(CmpOpTy, ValTy);
- return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+ return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
}
}
else { // Scalar
switch (Opcode) {
case Instruction::ICmp: {
+ // A loaded value compared with 0 with multiple users becomes Load and
+ // Test. The load is then not foldable, so return 0 cost for the ICmp.
+ unsigned ScalarBits = ValTy->getScalarSizeInBits();
+ if (I != nullptr && ScalarBits >= 32)
+ if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+ if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+ C->getZExtValue() == 0)
+ return 0;
+
unsigned Cost = 1;
if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
- Cost += 2; // extend both operands
+ Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
return Cost;
}
case Instruction::Select:
if (ValTy->isFloatingPointTy())
- return 4; // No load on condition for FP, so this costs a conditional jump.
+ return 4; // No load on condition for FP - costs a conditional jump.
return 1; // Load On Condition.
}
}
@@ -804,7 +872,7 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return ((Index % 2 == 0) ? 1 : 0);
if (Opcode == Instruction::ExtractElement) {
- int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+ int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
// Give a slight penalty for moving out of vector pipeline to FXU unit.
if (Index == 0 && Val->isIntOrIntVectorTy())
@@ -816,58 +884,147 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index);
}
+// Check if a load may be folded as a memory operand in its user.
+bool SystemZTTIImpl::
+isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
+ if (!Ld->hasOneUse())
+ return false;
+ FoldedValue = Ld;
+ const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
+ unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
+ unsigned TruncBits = 0;
+ unsigned SExtBits = 0;
+ unsigned ZExtBits = 0;
+ if (UserI->hasOneUse()) {
+ unsigned UserBits = UserI->getType()->getScalarSizeInBits();
+ if (isa<TruncInst>(UserI))
+ TruncBits = UserBits;
+ else if (isa<SExtInst>(UserI))
+ SExtBits = UserBits;
+ else if (isa<ZExtInst>(UserI))
+ ZExtBits = UserBits;
+ }
+ if (TruncBits || SExtBits || ZExtBits) {
+ FoldedValue = UserI;
+ UserI = cast<Instruction>(*UserI->user_begin());
+ // Load (single use) -> trunc/extend (single use) -> UserI
+ }
+ if ((UserI->getOpcode() == Instruction::Sub ||
+ UserI->getOpcode() == Instruction::SDiv ||
+ UserI->getOpcode() == Instruction::UDiv) &&
+ UserI->getOperand(1) != FoldedValue)
+ return false; // Not commutative, only RHS foldable.
+ // LoadOrTruncBits holds the number of effectively loaded bits, or 0 if the
+ // load was extended.
+ unsigned LoadOrTruncBits =
+ ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
+ switch (UserI->getOpcode()) {
+ case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
+ case Instruction::Sub:
+ case Instruction::ICmp:
+ if (LoadedBits == 32 && ZExtBits == 64)
+ return true;
+ LLVM_FALLTHROUGH;
+ case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
+ if (UserI->getOpcode() != Instruction::ICmp) {
+ if (LoadedBits == 16 &&
+ (SExtBits == 32 ||
+ (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
+ return true;
+ if (LoadOrTruncBits == 16)
+ return true;
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:// SE: 32->64
+ if (LoadedBits == 32 && SExtBits == 64)
+ return true;
+ LLVM_FALLTHROUGH;
+ case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // This also makes sense for float operations, but disabled for now due
+ // to regressions.
+ // case Instruction::FCmp:
+ // case Instruction::FAdd:
+ // case Instruction::FSub:
+ // case Instruction::FMul:
+ // case Instruction::FDiv:
+
+ // All possible extensions of memory checked above.
+
+ // Comparison between memory and immediate.
+ if (UserI->getOpcode() == Instruction::ICmp)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
+ if (isUInt<16>(CI->getZExtValue()))
+ return true;
+ return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
+ break;
+ }
+ return false;
+}
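Not part of the patch: a standalone sketch, with a hypothetical function name, of the "effectively loaded bits" rule for the logical-operation case; extensions are covered by the opcode-specific checks above, so they zero out LoadOrTruncBits here.

// Hypothetical mirror of the final check in isFoldableLoad() for And/Or/Xor
// and similar users: only a 32- or 64-bit memory operand folds (n/o/x and
// their ...g forms take 32/64-bit storage operands).
static bool foldsAsLogicalMemOperand(unsigned LoadedBits, unsigned TruncBits,
                                     unsigned SExtBits, unsigned ZExtBits) {
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  return LoadOrTruncBits == 32 || LoadOrTruncBits == 64;
}
// foldsAsLogicalMemOperand(64, 32, 0, 0) -> true  (load i64, trunc to i32)
// foldsAsLogicalMemOperand(16, 0, 0, 0)  -> false (i16 needs a register op)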
+
+static bool isBswapIntrinsicCall(const Value *V) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ if (auto *CI = dyn_cast<CallInst>(I))
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::bswap)
+ return true;
+ return false;
+}
+
int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment, unsigned AddressSpace,
const Instruction *I) {
assert(!Src->isVoidTy() && "Invalid type");
- if (!Src->isVectorTy() && Opcode == Instruction::Load &&
- I != nullptr && I->hasOneUse()) {
- const Instruction *UserI = cast<Instruction>(*I->user_begin());
- unsigned Bits = Src->getScalarSizeInBits();
- bool FoldsLoad = false;
- switch (UserI->getOpcode()) {
- case Instruction::ICmp:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::SDiv:
- case Instruction::UDiv:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // This also makes sense for float operations, but disabled for now due
- // to regressions.
- // case Instruction::FCmp:
- // case Instruction::FAdd:
- // case Instruction::FSub:
- // case Instruction::FMul:
- // case Instruction::FDiv:
- FoldsLoad = (Bits == 32 || Bits == 64);
- break;
+ if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
+ // Store the load or its truncated or extended value in FoldedValue.
+ const Instruction *FoldedValue = nullptr;
+ if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
+ const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
+ assert (UserI->getNumOperands() == 2 && "Expected a binop.");
+
+ // UserI can't fold two loads, so in that case return 0 cost only
+ // half of the time.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (UserI->getOperand(i) == FoldedValue)
+ continue;
+
+ if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
+ LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
+ if (!OtherLoad &&
+ (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
+ isa<ZExtInst>(OtherOp)))
+ OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
+ if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
+ return i == 0; // Both operands foldable.
+ }
}
- if (FoldsLoad) {
- assert (UserI->getNumOperands() == 2 &&
- "Expected to only handle binops.");
-
- // UserI can't fold two loads, so in that case return 0 cost only
- // half of the time.
- for (unsigned i = 0; i < 2; ++i) {
- if (UserI->getOperand(i) == I)
- continue;
- if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
- if (LI->hasOneUse())
- return i == 0;
- }
- }
+ return 0; // Only I is foldable in user.
+ }
+ }
+ unsigned NumOps =
+ (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
+
+ // Store/Load reversed saves one instruction.
+ if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+ if (Opcode == Instruction::Load && I->hasOneUse()) {
+ const Instruction *LdUser = cast<Instruction>(*I->user_begin());
+ // In case of load -> bswap -> store, return normal cost for the load.
+ if (isBswapIntrinsicCall(LdUser) &&
+ (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
return 0;
- }
+ }
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ const Value *StoredVal = SI->getValueOperand();
+ if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
+ return 0;
+ }
}
- unsigned NumOps = getNumberOfParts(Src);
-
if (Src->getScalarSizeInBits() == 128)
// 128 bit scalars are held in a pair of two 64 bit registers.
NumOps *= 2;
@@ -875,34 +1032,94 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return NumOps;
}
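Not part of the patch: a sketch of the byte-swap credit applied above, assuming the bswap merges into a reversed load/store (lrv/lrvg, strv/strvg); the helper names are hypothetical.

// Hypothetical summary of the cost credit for bswap users/operands.
static unsigned loadCostWithBswapUser(bool BswapOnlyFeedsAStore) {
  // load -> bswap -> store keeps the normal load cost (the store side gets
  // the credit); otherwise the load folds into lrv/lrvg and is free.
  return BswapOnlyFeedsAStore ? 1 : 0;
}
static unsigned storeCostOfBswappedValue(bool BswapHasSingleUse) {
  // A single-use bswap result folds into strv/strvg, so the store is free.
  return BswapHasSingleUse ? 0 : 1;
}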
+// The generic implementation of getInterleavedMemoryOpCost() is based on
+// adding costs of the memory operations plus all the extracts and inserts
+// needed for using / defining the vector operands. The SystemZ version does
+// roughly the same but bases the computations on vector permutations
+// instead.
int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace,
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
- unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
- (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
- assert (WideBits > 0 && "Could not compute size of vector");
- int NumWideParts =
- ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+ // Return the ceiling of dividing A by B.
+ auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+ unsigned NumElts = VecTy->getVectorNumElements();
+ assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
+ unsigned VF = NumElts / Factor;
+ unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
+ unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
+ unsigned NumPermutes = 0;
+
+ if (Opcode == Instruction::Load) {
+ // Loading interleave groups may have gaps, which may mean fewer
+ // loads. Find out how many vectors will be loaded in total, and in how
+ // many of them each value will be in.
+ BitVector UsedInsts(NumVectorMemOps, false);
+ std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
+ for (unsigned Index : Indices)
+ for (unsigned Elt = 0; Elt < VF; ++Elt) {
+ unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
+ UsedInsts.set(Vec);
+ ValueVecs[Index].set(Vec);
+ }
+ NumVectorMemOps = UsedInsts.count();
+
+ for (unsigned Index : Indices) {
+ // Estimate that each loaded source vector containing this Index
+ // requires one operation, except that vperm can handle two input
+ // registers first time for each dst vector.
+ unsigned NumSrcVecs = ValueVecs[Index].count();
+ unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
+ assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
+ NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
+ }
+ } else {
+ // Estimate the permutes for each stored vector as the smaller of the
+ // number of elements and the number of source vectors. Subtract one per
+ // dst vector for vperm (S.A.).
+ unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
+ unsigned NumDstVecs = NumVectorMemOps;
+ assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
+ NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
+ }
- // How many source vectors are handled to produce a vectorized operand?
- int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
- int NumSrcParts =
- ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+ // Cost of load/store operations and the permutations needed.
+ return NumVectorMemOps + NumPermutes;
+}
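Not part of the patch: a standalone mock (hypothetical names, std::vector<bool> standing in for BitVector) of the interleaved-load accounting above, counting only the 128-bit loads actually touched by the requested indices plus the extra vperm sources per extracted value.

#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned interleavedLoadCost(unsigned Factor, unsigned VF,
                                    unsigned EltBits,
                                    const std::vector<unsigned> &Indices) {
  unsigned EltsPerReg = 128 / EltBits;
  unsigned NumMemRegs = (Factor * VF * EltBits + 127) / 128;
  unsigned NumDstVecs = (VF * EltBits + 127) / 128;

  // Which 128-bit loads are needed, and which of them feed each index.
  std::vector<bool> Used(NumMemRegs, false);
  std::vector<std::vector<bool>> ValueVecs(
      Factor, std::vector<bool>(NumMemRegs, false));
  for (unsigned Index : Indices)
    for (unsigned Elt = 0; Elt < VF; ++Elt) {
      unsigned Vec = (Index + Elt * Factor) / EltsPerReg;
      Used[Vec] = true;
      ValueVecs[Index][Vec] = true;
    }
  unsigned MemOps = (unsigned)std::count(Used.begin(), Used.end(), true);

  // One vperm per extra source register feeding each extracted value.
  unsigned Permutes = 0;
  for (unsigned Index : Indices) {
    unsigned SrcVecs = (unsigned)std::count(ValueVecs[Index].begin(),
                                            ValueVecs[Index].end(), true);
    Permutes += (SrcVecs > NumDstVecs ? SrcVecs - NumDstVecs : 1);
  }
  return MemOps + Permutes;
}

int main() {
  // Factor 4, VF 2, i32 elements, extracting indices 0 and 2:
  // two 128-bit loads plus one vperm per index -> cost 4.
  printf("%u\n", interleavedLoadCost(4, 2, 32, {0, 2}));
}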
- // A Load group may have gaps.
- unsigned NumOperands =
- ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
+ if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
+ return getNumVectorRegs(RetTy); // VPERM
+ return -1;
+}
- // Each needed permute takes two vectors as input.
- if (NumSrcParts > 1)
- NumSrcParts--;
- int NumPermutes = NumSrcParts * NumOperands;
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Value *> Args,
+ FastMathFlags FMF, unsigned VF) {
+ int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+ if (Cost != -1)
+ return Cost;
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
- // Cost of load/store operations and the permutations needed.
- return NumWideParts + NumPermutes;
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
+ int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+ if (Cost != -1)
+ return Cost;
+ return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
+ FMF, ScalarizationCostPassed);
}