Diffstat (limited to 'llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp  |  333
1 file changed, 267 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 86c8684d14dc..7ff05034c1f2 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
@@ -37,13 +38,17 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
static cl::opt<bool> EnableMaskedLoadStores(
- "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+ "enable-arm-maskedldst", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of masked loads and stores"));
static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
+extern cl::opt<bool> DisableTailPredication;
+
+extern cl::opt<bool> EnableMaskedGatherScatters;
+
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -104,7 +109,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return 1;
}
-int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
Type *Ty) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
@@ -512,6 +517,27 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
(EltWidth == 8);
}
+bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
+ if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+ return false;
+
+ // This method is called in 2 places:
+ // - from the vectorizer with a scalar type, in which case we need to get
+ // this as good as we can with the limited info we have (and rely on the cost
+ // model for the rest).
+ // - from the masked intrinsic lowering pass with the actual vector type.
+ // For MVE, we have a custom lowering pass that will already have lowered
+ // any gathers it can to MVE intrinsics, and we want to expand all the rest.
+ // That pass runs before the masked intrinsic lowering pass, so if we are
+ // here, we know we want to expand.
+ if (isa<VectorType>(Ty))
+ return false;
+
+ unsigned EltWidth = Ty->getScalarSizeInBits();
+ return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+ (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+}
+
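A minimal sketch (not part of the patch) of the kind of scalar loop for which the vectorizer queries isLegalMaskedGather; with 32-bit data and indices at natural alignment it satisfies the element-width and alignment checks above:

// Hypothetical example: an indexed load of i32 elements. The vectorizer
// sees the scalar type (int) here; the masked-intrinsic lowering pass later
// sees the vector type and, for MVE, is told to expand whatever the custom
// gather/scatter lowering pass did not already handle.
void gather_copy(int *dst, const int *src, const int *idx, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = src[idx[i]];
}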
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
assert(MI && "MemcpyInst expected");
@@ -640,58 +666,60 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int ARMTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args) {
+int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- const unsigned FunctionCallDivCost = 20;
- const unsigned ReciprocalDivCost = 10;
- static const CostTblEntry CostTbl[] = {
- // Division.
- // These costs are somewhat random. Choose a cost of 20 to indicate that
- // vectorizing devision (added function call) is going to be very expensive.
- // Double registers types.
- { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
- { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
- { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
- { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
- { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
- { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
- { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
- // Quad register types.
- { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
- { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
- { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
- // Multiplication.
- };
-
if (ST->hasNEON()) {
+ const unsigned FunctionCallDivCost = 20;
+ const unsigned ReciprocalDivCost = 10;
+ static const CostTblEntry CostTbl[] = {
+ // Division.
+ // These costs are somewhat random. Choose a cost of 20 to indicate that
+ // vectorizing division (added function call) is going to be very expensive.
+ // Double register types.
+ { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
+ { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
+ { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
+ { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
+ { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
+ // Quad register types.
+ { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+ { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+ { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+ // Multiplication.
+ };
+
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
@@ -712,6 +740,33 @@ int ARMTTIImpl::getArithmeticInstrCost(
return Cost;
}
+ // If this operation is a shift on arm/thumb2, it might well be folded into
+ // the following instruction, hence having a cost of 0.
+ auto LooksLikeAFreeShift = [&]() {
+ if (ST->isThumb1Only() || Ty->isVectorTy())
+ return false;
+
+ if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
+ return false;
+ if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
+ return false;
+
+ // Folded into an ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
+ switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::ICmp:
+ return true;
+ default:
+ return false;
+ }
+ };
+ if (LooksLikeAFreeShift())
+ return 0;
+
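As an illustration (not part of the patch), the case LooksLikeAFreeShift models is a constant shift whose single user is a data-processing instruction that ARM/Thumb-2 can absorb via its shifted-operand form:

// Hypothetical example: the shift folds into the add's second operand,
// e.g. "add r0, r1, r2, lsl #2", so modelling the shift as cost 0 is fair.
int scaled_add(int a, int b) {
  return a + (b << 2);
}
// A vector shift, a Thumb-1 target, or a shift with several users keeps
// its normal cost.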
int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
@@ -735,11 +790,13 @@ int ARMTTIImpl::getArithmeticInstrCost(
return BaseCost;
}
-int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace, const Instruction *I) {
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
+ if (ST->hasNEON() && Src->isVectorTy() &&
+ (Alignment && *Alignment != Align(16)) &&
Src->getVectorElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
@@ -751,13 +808,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
return BaseCost * LT.first;
}
-int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -772,9 +826,19 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one vldN/vstN instruction.
+ int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
if (NumElts % Factor == 0 &&
- TLI->isLegalInterleavedAccessType(SubVecTy, DL))
- return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+ TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+ return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+
+ // Some smaller-than-legal interleaved patterns are cheap, as we can make
+ // use of the vmovn or vrev patterns to interleave a standard load. This is
+ // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
+ // promoted differently). The cost of 2 here is then a load and vrev or
+ // vmovn.
+ if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
+ VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
+ return 2 * BaseCost;
}
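For context (not part of the patch), the factor-2 case given the flat cost of 2 * BaseCost typically comes from de-interleaving adjacent narrow elements, for example:

// Hypothetical example: stride-2 i8 accesses. Assuming the sub-vector is
// 64 bits or less, this can be done as one wide load plus a vrev/vmovn
// style shuffle rather than a scalarised gather.
void deinterleave(signed char *even, signed char *odd,
                  const signed char *src, int n) {
  for (int i = 0; i < n; ++i) {
    even[i] = src[2 * i];
    odd[i]  = src[2 * i + 1];
  }
}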
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -998,6 +1062,142 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
return true;
}
+static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
+ // We don't allow icmp's, and because we only look at single block loops,
+ // we simply count the icmps, i.e. there should only be 1 for the backedge.
+ if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
+ return false;
+
+ if (isa<FCmpInst>(&I))
+ return false;
+
+ // We could allow extending/narrowing FP loads/stores, but codegen is
+ // too inefficient so reject this for now.
+ if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
+ return false;
+
+ // Extends have to be extending-loads
+ if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
+ if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
+ return false;
+
+ // Truncs have to be narrowing-stores
+ if (isa<TruncInst>(&I) )
+ if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
+ return false;
+
+ return true;
+}
+
+// To set up a tail-predicated loop, we need to know the total number of
+// elements processed by that loop. Thus, we need to determine the element
+// size and:
+// 1) it should be uniform for all operations in the vector loop, so, for
+// example, we don't want any widening/narrowing operations.
+// 2) it should be smaller than i64s because we don't have vector operations
+// that work on i64s.
+// 3) we don't want elements to be reversed or shuffled, to make sure the
+// tail-predication masks/predicates the right lanes.
+//
+static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ const DataLayout &DL,
+ const LoopAccessInfo *LAI) {
+ PredicatedScalarEvolution PSE = LAI->getPSE();
+ int ICmpCount = 0;
+ int Stride = 0;
+
+ LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
+ SmallVector<Instruction *, 16> LoadStores;
+ for (BasicBlock *BB : L->blocks()) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (isa<PHINode>(&I))
+ continue;
+ if (!canTailPredicateInstruction(I, ICmpCount)) {
+ LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
+ return false;
+ }
+
+ Type *T = I.getType();
+ if (T->isPointerTy())
+ T = T->getPointerElementType();
+
+ if (T->getScalarSizeInBits() > 32) {
+ LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
+ return false;
+ }
+
+ if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
+ Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
+ int64_t NextStride = getPtrStride(PSE, Ptr, L);
+ // TODO: for now only allow consecutive strides of 1. We could support
+ // other strides as long as it is uniform, but let's keep it simple for
+ // now.
+ if (Stride == 0 && NextStride == 1) {
+ Stride = NextStride;
+ continue;
+ }
+ if (Stride != NextStride) {
+ LLVM_DEBUG(dbgs() << "Different strides found, can't "
+ "tail-predicate\n.");
+ return false;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
+ return true;
+}
+
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+ ScalarEvolution &SE,
+ AssumptionCache &AC,
+ TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ const LoopAccessInfo *LAI) {
+ if (DisableTailPredication)
+ return false;
+
+ // Creating a predicated vector loop is the first step for generating a
+ // tail-predicated hardware loop, for which we need the MVE masked
+ // load/stores instructions:
+ if (!ST->hasMVEIntegerOps())
+ return false;
+
+ // For now, restrict this to single block loops.
+ if (L->getNumBlocks() > 1) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+ "loop.\n");
+ return false;
+ }
+
+ assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+
+ HardwareLoopInfo HWLoopInfo(L);
+ if (!HWLoopInfo.canAnalyze(*LI)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "analyzable.\n");
+ return false;
+ }
+
+ // This checks if we have the low-overhead branch architecture
+ // extension, and if we will create a hardware-loop:
+ if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "profitable.\n");
+ return false;
+ }
+
+ if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
+ LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+ "a candidate.\n");
+ return false;
+ }
+
+ return canTailPredicateLoop(L, LI, SE, DL, LAI);
+}
+
+
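As an illustrative sketch (not part of the patch), a loop that should pass canTailPredicateLoop: a single block, unit-stride consecutive accesses, a uniform element size of at most 32 bits, and only the backedge compare:

// Hypothetical example that tail-predication can handle.
void vadd(int *dst, const int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = a[i] + b[i];
}
// By contrast, a loop that widens to 64-bit values, compares floats, or
// walks memory with a non-unit stride would be rejected by the checks above.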
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
@@ -1035,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
unsigned Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ // Don't unroll vectorised loop. MVE does not benefit from it as much as
+ // scalar code.
+ if (I.getType()->isVectorTy())
+ return;
+
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
ImmutableCallSite CS(&I);
if (const Function *F = CS.getCalledFunction()) {
@@ -1043,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
return;
}
- // Don't unroll vectorised loop. MVE does not benefit from it as much as
- // scalar code.
- if (I.getType()->isVectorTy())
- return;
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());