diff options
Diffstat (limited to 'lib/Target/AArch64/AArch64TargetTransformInfo.cpp')
| -rw-r--r-- | lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 190 | 
1 file changed, 153 insertions, 37 deletions
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 1820ad959fcb..d75fef7b0171 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -38,7 +38,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,    return (CallerBits & CalleeBits) == CalleeBits;  } -/// \brief Calculate the cost of materializing a 64-bit value. This helper +/// Calculate the cost of materializing a 64-bit value. This helper  /// method might only calculate a fraction of a larger immediate. Therefore it  /// is valid to return a cost of ZERO.  int AArch64TTIImpl::getIntImmCost(int64_t Val) { @@ -54,7 +54,7 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {    return (64 - LZ + 15) / 16;  } -/// \brief Calculate the cost of materializing the given constant. +/// Calculate the cost of materializing the given constant.  int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {    assert(Ty->isIntegerTy()); @@ -277,7 +277,7 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,        // same as the second operand. In this case, we will generate a "long"        // version of the widening instruction.        if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) -        if (I->getOpcode() == Cast->getOpcode() && +        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&              cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())            return 0;      } @@ -493,32 +493,70 @@ int AArch64TTIImpl::getArithmeticInstrCost(    int ISD = TLI->InstructionOpcodeToISD(Opcode); -  if (ISD == ISD::SDIV && -      Opd2Info == TargetTransformInfo::OK_UniformConstantValue && -      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { -    // On AArch64, scalar signed division by constants power-of-two are -    // normally expanded to the sequence ADD + CMP + SELECT + SRA. 
-    // The OperandValue properties many not be same as that of previous -    // operation; conservatively assume OP_None. -    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, -                                   TargetTransformInfo::OP_None, -                                   TargetTransformInfo::OP_None); -    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, -                                   TargetTransformInfo::OP_None, -                                   TargetTransformInfo::OP_None); -    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, -                                   TargetTransformInfo::OP_None, -                                   TargetTransformInfo::OP_None); -    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, -                                   TargetTransformInfo::OP_None, -                                   TargetTransformInfo::OP_None); -    return Cost; -  } -    switch (ISD) {    default:      return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,                                                  Opd1PropInfo, Opd2PropInfo); +  case ISD::SDIV: +    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue && +        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { +      // On AArch64, scalar signed division by constants power-of-two are +      // normally expanded to the sequence ADD + CMP + SELECT + SRA. +      // The OperandValue properties may not be same as that of previous +      // operation; conservatively assume OP_None. 
+      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, +                                     TargetTransformInfo::OP_None, +                                     TargetTransformInfo::OP_None); +      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, +                                     TargetTransformInfo::OP_None, +                                     TargetTransformInfo::OP_None); +      Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info, +                                     TargetTransformInfo::OP_None, +                                     TargetTransformInfo::OP_None); +      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info, +                                     TargetTransformInfo::OP_None, +                                     TargetTransformInfo::OP_None); +      return Cost; +    } +    LLVM_FALLTHROUGH; +  case ISD::UDIV: +    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) { +      auto VT = TLI->getValueType(DL, Ty); +      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { +        // Vector signed division by constant are expanded to the +        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division +        // to MULHS + SUB + SRL + ADD + SRL. 
+        int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info, +                                             Opd2Info, +                                             TargetTransformInfo::OP_None, +                                             TargetTransformInfo::OP_None); +        int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, +                                             Opd2Info, +                                             TargetTransformInfo::OP_None, +                                             TargetTransformInfo::OP_None); +        int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, +                                             Opd2Info, +                                             TargetTransformInfo::OP_None, +                                             TargetTransformInfo::OP_None); +        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; +      } +    } + +    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, +                                          Opd1PropInfo, Opd2PropInfo); +    if (Ty->isVectorTy()) { +      // On AArch64, vector divisions are not supported natively and are +      // expanded into scalar divisions of each pair of elements. +      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info, +                                     Opd2Info, Opd1PropInfo, Opd2PropInfo); +      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info, +                                     Opd2Info, Opd1PropInfo, Opd2PropInfo); +      // TODO: if one of the arguments is scalar, then it's not necessary to +      // double the cost of handling the vector elements. 
+      Cost += Cost; +    } +    return Cost; +    case ISD::ADD:    case ISD::MUL:    case ISD::XOR: @@ -596,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,      return LT.first * 2 * AmortizationCost;    } -  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) && -      Ty->getVectorNumElements() < 8) { -    // We scalarize the loads/stores because there is not v.4b register and we -    // have to promote the elements to v.4h. -    unsigned NumVecElts = Ty->getVectorNumElements(); -    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; -    // We generate 2 instructions per vector element. -    return NumVectorizableInstsToAmortize * NumVecElts * 2; +  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) { +    unsigned ProfitableNumElements; +    if (Opcode == Instruction::Store) +      // We use a custom trunc store lowering so v.4b should be profitable. +      ProfitableNumElements = 4; +    else +      // We scalarize the loads because there is not v.4b register and we +      // have to promote the elements to v.2. +      ProfitableNumElements = 8; + +    if (Ty->getVectorNumElements() < ProfitableNumElements) { +      unsigned NumVecElts = Ty->getVectorNumElements(); +      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; +      // We generate 2 instructions per vector element. +      return NumVectorizableInstsToAmortize * NumVecElts * 2; +    }    }    return LT.first; @@ -690,14 +736,14 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,    };    int StridedLoads = countStridedLoads(L, SE); -  DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads -               << " strided loads\n"); +  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads +                    << " strided loads\n");    // Pick the largest power of 2 unroll count that won't result in too many    // strided loads.    
if (StridedLoads) {      UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); -    DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount -                 << '\n'); +    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " +                      << UP.MaxCount << '\n');    }  } @@ -868,3 +914,73 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,    }    return false;  } + +int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, +                                               bool IsPairwiseForm) { + +  if (IsPairwiseForm) +    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); + +  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); +  MVT MTy = LT.second; +  int ISD = TLI->InstructionOpcodeToISD(Opcode); +  assert(ISD && "Invalid opcode"); + +  // Horizontal adds can use the 'addv' instruction. We model the cost of these +  // instructions as normal vector adds. This is the only arithmetic vector +  // reduction operation for which we have an instruction. +  static const CostTblEntry CostTblNoPairwise[]{ +      {ISD::ADD, MVT::v8i8,  1}, +      {ISD::ADD, MVT::v16i8, 1}, +      {ISD::ADD, MVT::v4i16, 1}, +      {ISD::ADD, MVT::v8i16, 1}, +      {ISD::ADD, MVT::v4i32, 1}, +  }; + +  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) +    return LT.first * Entry->Cost; + +  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm); +} + +int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, +                                   Type *SubTp) { +  if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || +      Kind == TTI::SK_PermuteSingleSrc) { +    static const CostTblEntry ShuffleTbl[] = { +      // Transpose shuffle kinds can be performed with 'trn1/trn2' and +      // 'zip1/zip2' instructions. 
+      { TTI::SK_Transpose, MVT::v8i8,  1 }, +      { TTI::SK_Transpose, MVT::v16i8, 1 }, +      { TTI::SK_Transpose, MVT::v4i16, 1 }, +      { TTI::SK_Transpose, MVT::v8i16, 1 }, +      { TTI::SK_Transpose, MVT::v2i32, 1 }, +      { TTI::SK_Transpose, MVT::v4i32, 1 }, +      { TTI::SK_Transpose, MVT::v2i64, 1 }, +      { TTI::SK_Transpose, MVT::v2f32, 1 }, +      { TTI::SK_Transpose, MVT::v4f32, 1 }, +      { TTI::SK_Transpose, MVT::v2f64, 1 }, +      // Select shuffle kinds. +      // TODO: handle vXi8/vXi16. +      { TTI::SK_Select, MVT::v2i32, 1 }, // mov. +      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar). +      { TTI::SK_Select, MVT::v2i64, 1 }, // mov. +      { TTI::SK_Select, MVT::v2f32, 1 }, // mov. +      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar). +      { TTI::SK_Select, MVT::v2f64, 1 }, // mov. +      // PermuteSingleSrc shuffle kinds. +      // TODO: handle vXi8/vXi16. +      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov. +      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case. +      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov. +      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov. +      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case. +      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov. +    }; +    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); +    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) +      return LT.first * Entry->Cost; +  } + +  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +}  | 
