Diffstat (limited to 'lib/Target/SystemZ/SystemZTargetTransformInfo.cpp')
| -rw-r--r-- | lib/Target/SystemZ/SystemZTargetTransformInfo.cpp | 489 |
1 file changed, 353 insertions(+), 136 deletions(-)
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index c5cdc22f2099c..129610fe095b0 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -328,6 +328,25 @@ bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
   return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
 }
 
+// Return the bit size for the scalar type or vector element
+// type. getScalarSizeInBits() returns 0 for a pointer type.
+static unsigned getScalarSizeInBits(Type *Ty) {
+  unsigned Size =
+    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
+  assert(Size > 0 && "Element must have non-zero size.");
+  return Size;
+}
+
+// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
+// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
+// 3.
+static unsigned getNumVectorRegs(Type *Ty) {
+  assert(Ty->isVectorTy() && "Expected vector type");
+  unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
+  assert(WideBits > 0 && "Could not compute size of vector");
+  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+}
+
 int SystemZTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info,
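The new getNumVectorRegs() helper above simply rounds a vector's total bit width up to whole 128-bit vector registers, instead of asking type legalization how many parts the type splits into. A standalone sketch of that arithmetic, with hypothetical names (not part of the patch):

```cpp
#include <cassert>
#include <iostream>

// Hypothetical stand-in for the patch's getNumVectorRegs(): round the
// total bit size of a vector up to whole 128-bit SystemZ vector registers.
static unsigned numVectorRegs(unsigned ScalarBits, unsigned NumElements) {
  unsigned WideBits = ScalarBits * NumElements;
  assert(WideBits > 0 && "Could not compute size of vector");
  return (WideBits + 127U) / 128U; // Ceiling division by 128.
}

int main() {
  // <6 x i64> occupies 384 bits: exactly 3 registers, not the 4 that
  // getNumberOfParts() would report after legalization splits the type.
  std::cout << numVectorRegs(64, 6) << '\n'; // 3
  std::cout << numVectorRegs(32, 4) << '\n'; // 1
}
```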
@@ -343,44 +362,59 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 
   unsigned ScalarBits = Ty->getScalarSizeInBits();
 
-  // Div with a constant which is a power of 2 will be converted by
-  // DAGCombiner to use shifts. With vector shift-element instructions, a
-  // vector sdiv costs about as much as a scalar one.
-  const unsigned SDivCostEstimate = 4;
-  bool SDivPow2 = false;
-  bool UDivPow2 = false;
-  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
-      Args.size() == 2) {
-    const ConstantInt *CI = nullptr;
+  // There are three cases of division and remainder: Dividing with a register
+  // needs a divide instruction. A divisor which is a power of two constant
+  // can be implemented with a sequence of shifts. Any other constant needs a
+  // multiply and shifts.
+  const unsigned DivInstrCost = 20;
+  const unsigned DivMulSeqCost = 10;
+  const unsigned SDivPow2Cost = 4;
+
+  bool SignedDivRem =
+      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+  bool UnsignedDivRem =
+      Opcode == Instruction::UDiv || Opcode == Instruction::URem;
+
+  // Check for a constant divisor.
+  bool DivRemConst = false;
+  bool DivRemConstPow2 = false;
+  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
-      if (C->getType()->isVectorTy())
-        CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+      const ConstantInt *CVal =
+          (C->getType()->isVectorTy()
+               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
+               : dyn_cast<const ConstantInt>(C));
+      if (CVal != nullptr &&
+          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+        DivRemConstPow2 = true;
       else
-        CI = dyn_cast<const ConstantInt>(C);
-    }
-    if (CI != nullptr &&
-        (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
-      if (Opcode == Instruction::SDiv)
-        SDivPow2 = true;
-      else
-        UDivPow2 = true;
+        DivRemConst = true;
     }
   }
 
   if (Ty->isVectorTy()) {
-    assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+    assert(ST->hasVector() &&
+           "getArithmeticInstrCost() called with vector type.");
     unsigned VF = Ty->getVectorNumElements();
-    unsigned NumVectors = getNumberOfParts(Ty);
+    unsigned NumVectors = getNumVectorRegs(Ty);
 
     // These vector operations are custom handled, but are still supported
     // with one instruction per vector, regardless of element size.
     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
-        Opcode == Instruction::AShr || UDivPow2) {
+        Opcode == Instruction::AShr) {
       return NumVectors;
     }
 
-    if (SDivPow2)
-      return (NumVectors * SDivCostEstimate);
+    if (DivRemConstPow2)
+      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
+    if (DivRemConst)
+      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
+      // Temporary hack: disable high vectorization factors with integer
+      // division/remainder, which will get scalarized and handled with
+      // GR128 registers. The mischeduler is not clever enough to avoid
+      // spilling yet.
+      return 1000;
 
     // These FP operations are supported with a single vector instruction for
     // double (base implementation assumes float generally costs 2). For
@@ -395,7 +429,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return NumVectors;
     // Return the cost of multiple scalar invocation plus the cost of
     // inserting and extracting the values.
-    unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    unsigned ScalarCost =
+        getArithmeticInstrCost(Opcode, Ty->getScalarType());
     unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
     // FIXME: VF 2 for these FP operations are currently just as
     // expensive as for VF 4.
@@ -432,30 +467,22 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     if (Opcode == Instruction::FRem)
       return LIBCALL_COST;
 
-    if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
-      return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
-
     // Or requires one instruction, although it has custom handling for i64.
     if (Opcode == Instruction::Or)
       return 1;
 
-    if (Opcode == Instruction::Xor && ScalarBits == 1)
-      // 2 * ipm sequences ; xor ; shift ; compare
-      return 7;
-
-    if (UDivPow2)
-      return 1;
-    if (SDivPow2)
-      return SDivCostEstimate;
-
-    // An extra extension for narrow types is needed.
-    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
-      // sext of op(s) for narrow types
-      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+    if (Opcode == Instruction::Xor && ScalarBits == 1) {
+      if (ST->hasLoadStoreOnCond2())
+        return 5; // 2 * (li 0; loc 1); xor
+      return 7; // 2 * ipm sequences ; xor ; shift ; compare
+    }
 
-    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
-      // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
-      return (ScalarBits < 32 ? 4 : 2);
+    if (DivRemConstPow2)
+      return (SignedDivRem ? SDivPow2Cost : 1);
+    if (DivRemConst)
+      return DivMulSeqCost;
+    if (SignedDivRem || UnsignedDivRem)
+      return DivInstrCost;
   }
 
   // Fallback to the default implementation.
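The three cost constants above encode the three divisor cases the comment describes: a register divisor costs a hardware divide (20), any other constant a multiply-and-shift sequence (10), and a power-of-two constant a shift sequence (4 for signed division). A minimal sketch of the same selection for scalar signed division; the helper names and the power-of-two test are assumptions, only the constants come from the patch:

```cpp
#include <cstdint>
#include <iostream>

// Cost constants from the patch: hardware divide, multiply+shift sequence
// for general constants, shift sequence for signed power-of-two division.
constexpr unsigned DivInstrCost = 20;
constexpr unsigned DivMulSeqCost = 10;
constexpr unsigned SDivPow2Cost = 4;

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

// Hypothetical helper mirroring the scalar signed-division cost selection.
static unsigned scalarSDivCost(bool DivisorIsConst, int64_t Divisor) {
  if (DivisorIsConst && (isPowerOf2(Divisor) || isPowerOf2(-Divisor)))
    return SDivPow2Cost;  // Shift sequence.
  if (DivisorIsConst)
    return DivMulSeqCost; // Multiply and shifts.
  return DivInstrCost;    // Register divisor: divide instruction.
}

int main() {
  std::cout << scalarSDivCost(true, 8) << '\n';  // 4
  std::cout << scalarSDivCost(true, 10) << '\n'; // 10
  std::cout << scalarSDivCost(false, 0) << '\n'; // 20
}
```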
@@ -463,12 +490,11 @@ int SystemZTTIImpl::getArithmeticInstrCost(
                                        Opd1PropInfo, Opd2PropInfo, Args);
 }
 
-
 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
   assert (Tp->isVectorTy());
   assert (ST->hasVector() && "getShuffleCost() called.");
-  unsigned NumVectors = getNumberOfParts(Tp);
+  unsigned NumVectors = getNumVectorRegs(Tp);
 
   // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
 
@@ -523,7 +549,7 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
 
   // TODO: Since fp32 is expanded, the extract cost should always be 0.
 
-  unsigned NumParts = getNumberOfParts(SrcTy);
+  unsigned NumParts = getNumVectorRegs(SrcTy);
   if (NumParts <= 2)
     // Up to 2 vector registers can be truncated efficiently with pack or
     // permute. The latter requires an immediate mask to be loaded, which
@@ -566,7 +592,7 @@ getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
     // The bitmask will be truncated.
     PackCost = getVectorTruncCost(SrcTy, DstTy);
   else if (SrcScalarBits < DstScalarBits) {
-    unsigned DstNumParts = getNumberOfParts(DstTy);
+    unsigned DstNumParts = getNumVectorRegs(DstTy);
     // Each vector select needs its part of the bitmask unpacked.
     PackCost = Log2Diff * DstNumParts;
     // Extra cost for moving part of mask before unpacking.
@@ -602,6 +628,25 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
   return nullptr;
 }
 
+// Get the cost of converting a boolean vector to a vector with same width
+// and element size as Dst, plus the cost of zero extending if needed.
+unsigned SystemZTTIImpl::
+getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+                              const Instruction *I) {
+  assert (Dst->isVectorTy());
+  unsigned VF = Dst->getVectorNumElements();
+  unsigned Cost = 0;
+  // If we know the widths of the compared operands, get any cost of
+  // converting them to match Dst. Otherwise assume same widths.
+  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+  if (CmpOpTy != nullptr)
+    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
+    // One 'vn' per dst vector with an immediate mask.
+    Cost += getNumVectorRegs(Dst);
+  return Cost;
+}
+
 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
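getVectorBitmaskConversionCost(), which the new getBoolVecToIntConversionCost() builds on, models mask widening as one vector unpack per doubling of the element width (the Log2Diff factor above). A sketch of that step count, with a hypothetical function name:

```cpp
#include <iostream>

// Hypothetical sketch: widening a vector bitmask from SrcBits-wide to
// DstBits-wide elements takes one vector unpack per doubling
// (e.g. i8 -> i64 element masks: 8 -> 16 -> 32 -> 64 = 3 unpacks).
static unsigned log2Diff(unsigned SrcBits, unsigned DstBits) {
  unsigned Steps = 0;
  while (SrcBits < DstBits) {
    SrcBits *= 2;
    ++Steps;
  }
  return Steps;
}

int main() {
  std::cout << log2Diff(8, 64) << '\n';  // 3
  std::cout << log2Diff(32, 64) << '\n'; // 1
}
```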
@@ -611,8 +656,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
     assert (Dst->isVectorTy());
     unsigned VF = Src->getVectorNumElements();
-    unsigned NumDstVectors = getNumberOfParts(Dst);
-    unsigned NumSrcVectors = getNumberOfParts(Src);
+    unsigned NumDstVectors = getNumVectorRegs(Dst);
+    unsigned NumSrcVectors = getNumVectorRegs(Src);
 
     if (Opcode == Instruction::Trunc) {
       if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
@@ -633,19 +678,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 
       return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
     }
-    else if (SrcScalarBits == 1) {
-      // This should be extension of a compare i1 result.
-      // If we know what the widths of the compared operands, get the
-      // cost of converting it to Dst. Otherwise assume same widths.
-      unsigned Cost = 0;
-      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
-      if (CmpOpTy != nullptr)
-        Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
-      if (Opcode == Instruction::ZExt)
-        // One 'vn' per dst vector with an immediate mask.
-        Cost += NumDstVectors;
-      return Cost;
-    }
+    else if (SrcScalarBits == 1)
+      return getBoolVecToIntConversionCost(Opcode, Dst, I);
   }
 
   if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
@@ -654,8 +688,13 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       // (seems to miss on differentiating on scalar/vector types).
 
       // Only 64 bit vector conversions are natively supported.
-      if (SrcScalarBits == 64 && DstScalarBits == 64)
-        return NumDstVectors;
+      if (DstScalarBits == 64) {
+        if (SrcScalarBits == 64)
+          return NumDstVectors;
+
+        if (SrcScalarBits == 1)
+          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
+      }
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values. Base implementation does not
@@ -672,7 +711,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
         NeedsExtracts = false;
 
-      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+      TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
+      TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
 
       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -702,11 +742,18 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   else { // Scalar
     assert (!Dst->isVectorTy());
 
-    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
-      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (SrcScalarBits >= 32 ||
+          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+        return 1;
+      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+    }
 
     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
         Src->isIntegerTy(1)) {
+      if (ST->hasLoadStoreOnCond2())
+        return 2; // li 0; loc 1
+
       // This should be extension of a compare i1 result, which is done with
       // ipm and a varying sequence of instructions.
       unsigned Cost = 0;
@@ -718,7 +765,6 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
         // If operands of an fp-type was compared, this costs +1.
         Cost++;
-
       return Cost;
     }
   }
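The new scalar SIToFP/UIToFP rule above reads as a small decision table: one instruction for i32/i64 sources (and for loaded values, whose extension is free), two for i8/i16 register values (extend plus convert), and a branch sequence for i1. A sketch mirroring it; the function name is hypothetical, the cost values are the patch's:

```cpp
#include <iostream>

// Sketch of the patch's scalar int-to-FP conversion cost.
static unsigned intToFpCost(unsigned SrcBits, bool SrcIsLoad) {
  if (SrcBits >= 32 || SrcIsLoad)
    return 1; // Single convert; a load extends for free.
  return SrcBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
}

int main() {
  std::cout << intToFpCost(64, false) << '\n'; // 1
  std::cout << intToFpCost(16, false) << '\n'; // 2
  std::cout << intToFpCost(16, true) << '\n';  // 1
  std::cout << intToFpCost(1, false) << '\n';  // 5
}
```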
@@ -726,8 +772,20 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
 
-int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
-                                       const Instruction *I) {
+// Scalar i8 / i16 operations will typically be made after first extending
+// the operands to i32.
+static unsigned getOperandsExtensionCost(const Instruction *I) {
+  unsigned ExtCost = 0;
+  for (Value *Op : I->operands())
+    // A load of i8 or i16 sign/zero extends to i32.
+    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
+      ExtCost++;
+
+  return ExtCost;
+}
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy, const Instruction *I) {
   if (ValTy->isVectorTy()) {
     assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
     unsigned VF = ValTy->getVectorNumElements();
@@ -759,7 +817,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
       // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
       // floats.  FIXME: <2 x float> generates same code as <4 x float>.
       unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
-      unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
 
       unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
       return Cost;
@@ -775,20 +833,30 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
 
         PackCost = getVectorBitmaskConversionCost(CmpOpTy, ValTy);
 
-      return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
     }
   }
   else { // Scalar
     switch (Opcode) {
     case Instruction::ICmp: {
+      // A loaded value compared with 0 with multiple users becomes Load and
+      // Test. The load is then not foldable, so return 0 cost for the ICmp.
+      unsigned ScalarBits = ValTy->getScalarSizeInBits();
+      if (I != nullptr && ScalarBits >= 32)
+        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+                C->getZExtValue() == 0)
+              return 0;
+
       unsigned Cost = 1;
       if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
-        Cost += 2; // extend both operands
+        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
       return Cost;
     }
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP, so this costs a conditional jump.
+        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition.
     }
   }
@@ -804,7 +872,7 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
     return ((Index % 2 == 0) ? 1 : 0);
 
   if (Opcode == Instruction::ExtractElement) {
-    int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
 
     // Give a slight penalty for moving out of vector pipeline to FXU unit.
     if (Index == 0 && Val->isIntOrIntVectorTy())
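getOperandsExtensionCost() above only charges for compare operands that need a separate extension: a loaded i8/i16 value extends for free during the load, and an immediate needs none. A simplified standalone model; the Operand struct and names are hypothetical:

```cpp
#include <iostream>

// Simplified model of getOperandsExtensionCost(): an i8/i16 compare is
// done in a 32-bit register, but an operand needs no extra extension if
// it comes from a load or is an immediate.
struct Operand { bool IsLoad; bool IsConstant; };

static unsigned operandsExtensionCost(const Operand (&Ops)[2]) {
  unsigned ExtCost = 0;
  for (const Operand &Op : Ops)
    if (!Op.IsLoad && !Op.IsConstant)
      ++ExtCost;
  return ExtCost;
}

int main() {
  Operand LoadVsImm[2] = {{true, false}, {false, true}};
  Operand RegVsReg[2] = {{false, false}, {false, false}};
  std::cout << operandsExtensionCost(LoadVsImm) << '\n'; // 0
  std::cout << operandsExtensionCost(RegVsReg) << '\n';  // 2
}
```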
@@ -816,58 +884,147 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
+// Check if a load may be folded as a memory operand in its user.
+bool SystemZTTIImpl::
+isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
+  if (!Ld->hasOneUse())
+    return false;
+  FoldedValue = Ld;
+  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
+  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
+  unsigned TruncBits = 0;
+  unsigned SExtBits = 0;
+  unsigned ZExtBits = 0;
+  if (UserI->hasOneUse()) {
+    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
+    if (isa<TruncInst>(UserI))
+      TruncBits = UserBits;
+    else if (isa<SExtInst>(UserI))
+      SExtBits = UserBits;
+    else if (isa<ZExtInst>(UserI))
+      ZExtBits = UserBits;
+  }
+  if (TruncBits || SExtBits || ZExtBits) {
+    FoldedValue = UserI;
+    UserI = cast<Instruction>(*UserI->user_begin());
+    // Load (single use) -> trunc/extend (single use) -> UserI
+  }
+  if ((UserI->getOpcode() == Instruction::Sub ||
+       UserI->getOpcode() == Instruction::SDiv ||
+       UserI->getOpcode() == Instruction::UDiv) &&
+      UserI->getOperand(1) != FoldedValue)
+    return false; // Not commutative, only RHS foldable.
+  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
+  // extension was made of the load.
+  unsigned LoadOrTruncBits =
+      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
+  switch (UserI->getOpcode()) {
+  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
+  case Instruction::Sub:
+  case Instruction::ICmp:
+    if (LoadedBits == 32 && ZExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
+    if (UserI->getOpcode() != Instruction::ICmp) {
+      if (LoadedBits == 16 &&
+          (SExtBits == 32 ||
+           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
+        return true;
+      if (LoadOrTruncBits == 16)
+        return true;
+    }
+    LLVM_FALLTHROUGH;
+  case Instruction::SDiv: // SE: 32->64
+    if (LoadedBits == 32 && SExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::UDiv:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // This also makes sense for float operations, but disabled for now due
+    // to regressions.
+    // case Instruction::FCmp:
+    // case Instruction::FAdd:
+    // case Instruction::FSub:
+    // case Instruction::FMul:
+    // case Instruction::FDiv:
+
+    // All possible extensions of memory checked above.
+
+    // Comparison between memory and immediate.
+    if (UserI->getOpcode() == Instruction::ICmp)
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
+        if (isUInt<16>(CI->getZExtValue()))
+          return true;
+    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
+    break;
+  }
+  return false;
+}
+
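isFoldableLoad() above walks load -> optional single-use trunc/extend -> binary user, and refuses the left operand of non-commutative operations. A much-simplified standalone model covering only a few of its cases (all names hypothetical; the ZExt 32->64 add case corresponds to an instruction like algf):

```cpp
#include <iostream>
#include <string>

// Simplified model of isFoldableLoad(): whether a load of LoadedBits,
// possibly sign/zero-extended to ExtBits, can become the memory operand
// of 'Opcode'. Truncation and several opcode-specific cases are omitted.
static bool foldableLoad(const std::string &Opcode, unsigned LoadedBits,
                         unsigned SExtBits, unsigned ZExtBits,
                         bool LoadIsRHS) {
  // sub/div are not commutative: only the right-hand operand can fold.
  if ((Opcode == "sub" || Opcode == "sdiv" || Opcode == "udiv") && !LoadIsRHS)
    return false;
  // Effectively loaded bits; 0 if the load was extended.
  unsigned LoadOrTruncBits = (SExtBits || ZExtBits) ? 0 : LoadedBits;
  if ((Opcode == "add" || Opcode == "sub") &&
      LoadedBits == 32 && ZExtBits == 64)
    return true; // e.g. add logical with a zero-extending memory operand.
  return LoadOrTruncBits == 32 || LoadOrTruncBits == 64;
}

int main() {
  std::cout << foldableLoad("add", 32, 0, 64, true) << '\n'; // 1
  std::cout << foldableLoad("sub", 64, 0, 0, false) << '\n'; // 0
}
```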
+static bool isBswapIntrinsicCall(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    if (auto *CI = dyn_cast<CallInst>(I))
+      if (auto *F = CI->getCalledFunction())
+        if (F->getIntrinsicID() == Intrinsic::bswap)
+          return true;
+  return false;
+}
+
 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment, unsigned AddressSpace,
                                     const Instruction *I) {
   assert(!Src->isVoidTy() && "Invalid type");
 
-  if (!Src->isVectorTy() && Opcode == Instruction::Load &&
-      I != nullptr && I->hasOneUse()) {
-    const Instruction *UserI = cast<Instruction>(*I->user_begin());
-    unsigned Bits = Src->getScalarSizeInBits();
-    bool FoldsLoad = false;
-    switch (UserI->getOpcode()) {
-    case Instruction::ICmp:
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-    case Instruction::SDiv:
-    case Instruction::UDiv:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-      // This also makes sense for float operations, but disabled for now due
-      // to regressions.
-      // case Instruction::FCmp:
-      // case Instruction::FAdd:
-      // case Instruction::FSub:
-      // case Instruction::FMul:
-      // case Instruction::FDiv:
-      FoldsLoad = (Bits == 32 || Bits == 64);
-      break;
+  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
+    // Store the load or its truncated or extended value in FoldedValue.
+    const Instruction *FoldedValue = nullptr;
+    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
+      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
+      assert (UserI->getNumOperands() == 2 && "Expected a binop.");
+
+      // UserI can't fold two loads, so in that case return 0 cost only
+      // half of the time.
+      for (unsigned i = 0; i < 2; ++i) {
+        if (UserI->getOperand(i) == FoldedValue)
+          continue;
+
+        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))) {
+          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
+          if (!OtherLoad &&
+              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
+               isa<ZExtInst>(OtherOp)))
+            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
+          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
+            return i == 0; // Both operands foldable.
+        }
       }
-    if (FoldsLoad) {
-      assert (UserI->getNumOperands() == 2 &&
-              "Expected to only handle binops.");
-
-      // UserI can't fold two loads, so in that case return 0 cost only
-      // half of the time.
-      for (unsigned i = 0; i < 2; ++i) {
-        if (UserI->getOperand(i) == I)
-          continue;
-        if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
-          if (LI->hasOneUse())
-            return i == 0;
-        }
-      }
+      return 0; // Only I is foldable in user.
+    }
+  }
+
+  unsigned NumOps =
+    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
+
+  // Store/Load reversed saves one instruction.
+  if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
+    if (Opcode == Instruction::Load && I->hasOneUse()) {
+      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
+      // In case of load -> bswap -> store, return normal cost for the load.
+      if (isBswapIntrinsicCall(LdUser) &&
+          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
-    }
+    }
+    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      const Value *StoredVal = SI->getValueOperand();
+      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
+        return 0;
+    }
   }
 
-  unsigned NumOps = getNumberOfParts(Src);
-
   if (Src->getScalarSizeInBits() == 128)
     // 128 bit scalars are held in a pair of two 64 bit registers.
     NumOps *= 2;
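The reversed-access logic above gives the zero cost to whichever side of a byte-swapped access can become a load/store-reversed instruction (lrv/strv); in a load -> bswap -> store chain the store side gets the saving and the load is costed normally. A sketch of the load-side decision, with hypothetical names:

```cpp
#include <iostream>

// Sketch of the load-side costing: a bswap folds into a reversed load
// unless the bswap's single use is a store, in which case the reversed
// store takes the saving instead.
static unsigned loadCost(bool UserIsBswap, bool BswapFeedsStoreOnly) {
  if (UserIsBswap && !BswapFeedsStoreOnly)
    return 0; // Folded into lrv/lrvg.
  return 1;
}

int main() {
  std::cout << loadCost(true, false) << '\n';  // 0: lrv covers the bswap
  std::cout << loadCost(true, true) << '\n';   // 1: strv gets the saving
  std::cout << loadCost(false, false) << '\n'; // 1: plain load
}
```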
@@ -875,34 +1032,94 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   return NumOps;
 }
 
+// The generic implementation of getInterleavedMemoryOpCost() is based on
+// adding costs of the memory operations plus all the extracts and inserts
+// needed for using / defining the vector operands. The SystemZ version does
+// roughly the same but bases the computations on vector permutations
+// instead.
 int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
-  unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
-     (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
-  assert (WideBits > 0 && "Could not compute size of vector");
-  int NumWideParts =
-    ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+  // Return the ceiling of dividing A by B.
+  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+  unsigned NumElts = VecTy->getVectorNumElements();
+  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
+  unsigned VF = NumElts / Factor;
+  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
+  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
+  unsigned NumPermutes = 0;
+
+  if (Opcode == Instruction::Load) {
+    // Loading interleave groups may have gaps, which may mean fewer
+    // loads. Find out how many vectors will be loaded in total, and in how
+    // many of them each value will be.
+    BitVector UsedInsts(NumVectorMemOps, false);
+    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
+    for (unsigned Index : Indices)
+      for (unsigned Elt = 0; Elt < VF; ++Elt) {
+        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
+        UsedInsts.set(Vec);
+        ValueVecs[Index].set(Vec);
+      }
+    NumVectorMemOps = UsedInsts.count();
+
+    for (unsigned Index : Indices) {
+      // Estimate that each loaded source vector containing this Index
+      // requires one operation, except that vperm can handle two input
+      // registers the first time for each dst vector.
+      unsigned NumSrcVecs = ValueVecs[Index].count();
+      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
+      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
+      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
+    }
+  } else {
+    // Estimate the permutes for each stored vector as the smaller of the
+    // number of elements and the number of source vectors. Subtract one per
+    // dst vector for vperm (see above).
+    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
+    unsigned NumDstVecs = NumVectorMemOps;
+    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
+    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
+  }
 
-  // How many source vectors are handled to produce a vectorized operand?
-  int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
-  int NumSrcParts =
-    ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+  // Cost of load/store operations and the permutations needed.
+  return NumVectorMemOps + NumPermutes;
+}
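For stores, the permute estimate above works out to NumDstVecs * (NumSrcVecs - 1) vperms on top of the memory operations. A standalone sketch reproducing that estimate; the function is hypothetical and assumes an interleave group with no gaps:

```cpp
#include <algorithm>
#include <iostream>

// Sketch of the store-side estimate in getInterleavedMemoryOpCost():
// vector stores plus vector permutes, for ElemBits-wide elements stored
// with interleave factor Factor and vectorization factor VF.
static unsigned interleavedStoreCost(unsigned ElemBits, unsigned Factor,
                                     unsigned VF) {
  unsigned NumEltsPerVecReg = 128 / ElemBits;
  unsigned WideBits = ElemBits * Factor * VF;
  unsigned NumVectorMemOps = (WideBits + 127) / 128;
  // Each destination vector needs one vperm per source vector beyond the
  // two inputs the first vperm already consumes.
  unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
  unsigned NumPermutes = NumVectorMemOps * NumSrcVecs - NumVectorMemOps;
  return NumVectorMemOps + NumPermutes;
}

int main() {
  // Two interleaved i32 fields at VF=4 (<8 x i32> in memory):
  // 2 vector stores + 2 permutes.
  std::cout << interleavedStoreCost(32, 2, 4) << '\n'; // 4
}
```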
 
-  // A Load group may have gaps.
-  unsigned NumOperands =
-    ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
+  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
+    return getNumVectorRegs(RetTy); // VPERM
+  return -1;
+}
 
-  // Each needed permute takes two vectors as input.
-  if (NumSrcParts > 1)
-    NumSrcParts--;
-  int NumPermutes = NumSrcParts * NumOperands;
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                          ArrayRef<Value *> Args,
+                                          FastMathFlags FMF, unsigned VF) {
+  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+  if (Cost != -1)
+    return Cost;
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+}
 
-  // Cost of load/store operations and the permutations needed.
-  return NumWideParts + NumPermutes;
+int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
+                                          ArrayRef<Type *> Tys,
+                                          FastMathFlags FMF,
+                                          unsigned ScalarizationCostPassed) {
+  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+  if (Cost != -1)
+    return Cost;
+  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
+                                      FMF, ScalarizationCostPassed);
 }
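Both getIntrinsicInstrCost() overloads share one dispatch pattern: try the SystemZ-specific estimate first and fall back to the generic implementation when it returns the -1 sentinel. A stripped-down sketch of the pattern; all names and cost values are illustrative:

```cpp
#include <iostream>

// Target-specific estimate: knows only one intrinsic (bswap-like, ID 42),
// costed at one vperm; returns -1 for everything else.
static int targetSpecificCost(int ID) { return ID == 42 ? 1 : -1; }
static int genericCost(int /*ID*/) { return 10; }

static int intrinsicCost(int ID) {
  int Cost = targetSpecificCost(ID);
  if (Cost != -1)
    return Cost;
  return genericCost(ID); // Fall back on the sentinel.
}

int main() {
  std::cout << intrinsicCost(42) << '\n'; // 1: target-specific estimate
  std::cout << intrinsicCost(7) << '\n';  // 10: generic fallback
}
```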
