Diffstat (limited to 'llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 333
1 files changed, 267 insertions, 66 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 86c8684d14dc..7ff05034c1f2 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/Casting.h"
@@ -37,13 +38,17 @@ using namespace llvm;

 #define DEBUG_TYPE "armtti"

 static cl::opt<bool> EnableMaskedLoadStores(
-    "enable-arm-maskedldst", cl::Hidden, cl::init(false),
+    "enable-arm-maskedldst", cl::Hidden, cl::init(true),
     cl::desc("Enable the generation of masked loads and stores"));

 static cl::opt<bool> DisableLowOverheadLoops(
     "disable-arm-loloops", cl::Hidden, cl::init(false),
     cl::desc("Disable the generation of low-overhead loops"));

+extern cl::opt<bool> DisableTailPredication;
+
+extern cl::opt<bool> EnableMaskedGatherScatters;
+
 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -104,7 +109,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
   return 1;
 }

-int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) {
   // Division by a constant can be turned into multiplication, but only if we
   // know it's constant. So it's not so much that the immediate is cheap (it's
@@ -512,6 +517,27 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
          (EltWidth == 8);
 }

+bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
+  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+    return false;
+
+  // This method is called in 2 places:
+  //  - from the vectorizer with a scalar type, in which case we need to get
+  //  this as good as we can with the limited info we have (and rely on the cost
+  //  model for the rest).
+  //  - from the masked intrinsic lowering pass with the actual vector type.
+  // For MVE, we have a custom lowering pass that will already have custom
+  // legalised any gathers that we can to MVE intrinsics, and want to expand all
+  // the rest. The pass runs before the masked intrinsic lowering pass, so if we
+  // are here, we know we want to expand.
+  if (isa<VectorType>(Ty))
+    return false;
+
+  unsigned EltWidth = Ty->getScalarSizeInBits();
+  return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+          (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+}
+
 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
   assert(MI && "MemcpyInst expected");
@@ -640,58 +666,60 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }

-int ARMTTIImpl::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
-    TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
-    TTI::OperandValueProperties Opd2PropInfo,
-    ArrayRef<const Value *> Args) {
+int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                       TTI::OperandValueKind Op1Info,
+                                       TTI::OperandValueKind Op2Info,
+                                       TTI::OperandValueProperties Opd1PropInfo,
+                                       TTI::OperandValueProperties Opd2PropInfo,
+                                       ArrayRef<const Value *> Args,
+                                       const Instruction *CxtI) {
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

-  const unsigned FunctionCallDivCost = 20;
-  const unsigned ReciprocalDivCost = 10;
-  static const CostTblEntry CostTbl[] = {
-    // Division.
-    // These costs are somewhat random. Choose a cost of 20 to indicate that
-    // vectorizing devision (added function call) is going to be very expensive.
-    // Double registers types.
-    { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
-    { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
-    { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
-    { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
-    { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
-    // Quad register types.
-    { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
-    { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
-    // Multiplication.
-  };
-
   if (ST->hasNEON()) {
+    const unsigned FunctionCallDivCost = 20;
+    const unsigned ReciprocalDivCost = 10;
+    static const CostTblEntry CostTbl[] = {
+      // Division.
+      // These costs are somewhat random. Choose a cost of 20 to indicate that
+      // vectorizing devision (added function call) is going to be very expensive.
+      // Double registers types.
+      { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
+      { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
+      { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
+      { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
+      { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
+      // Quad register types.
+      { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
+      { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
+      // Multiplication.
+    };
+
     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
       return LT.first * Entry->Cost;
@@ -712,6 +740,33 @@ int ARMTTIImpl::getArithmeticInstrCost(
     return Cost;
   }

+  // If this operation is a shift on arm/thumb2, it might well be folded into
+  // the following instruction, hence having a cost of 0.
+  auto LooksLikeAFreeShift = [&]() {
+    if (ST->isThumb1Only() || Ty->isVectorTy())
+      return false;
+
+    if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
+      return false;
+    if (Op2Info != TargetTransformInfo::OK_UniformConstantValue)
+      return false;
+
+    // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
+    switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Xor:
+    case Instruction::Or:
+    case Instruction::ICmp:
+      return true;
+    default:
+      return false;
+    }
+  };
+  if (LooksLikeAFreeShift())
+    return 0;
+
   int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy() ?
                    ST->getMVEVectorCostFactor() : 1;
@@ -735,11 +790,13 @@ int ARMTTIImpl::getArithmeticInstrCost(
   return BaseCost;
 }

-int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace, const Instruction *I) {
+int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                MaybeAlign Alignment, unsigned AddressSpace,
+                                const Instruction *I) {
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

-  if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
+  if (ST->hasNEON() && Src->isVectorTy() &&
+      (Alignment && *Alignment != Align(16)) &&
       Src->getVectorElementType()->isDoubleTy()) {
     // Unaligned loads/stores are extremely inefficient.
     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
@@ -751,13 +808,10 @@ int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   return BaseCost * LT.first;
 }

-int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                           unsigned Factor,
-                                           ArrayRef<unsigned> Indices,
-                                           unsigned Alignment,
-                                           unsigned AddressSpace,
-                                           bool UseMaskForCond,
-                                           bool UseMaskForGaps) {
+int ARMTTIImpl::getInterleavedMemoryOpCost(
+    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+    bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");

@@ -772,9 +826,19 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
     // matched to more than one vldN/vstN instruction.
+    int BaseCost = ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor() : 1;
     if (NumElts % Factor == 0 &&
-        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+        TLI->isLegalInterleavedAccessType(Factor, SubVecTy, DL))
+      return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+
+    // Some smaller than legal interleaved patterns are cheap as we can make
+    // use of the vmovn or vrev patterns to interleave a standard load. This is
+    // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
+    // promoted differently). The cost of 2 here is then a load and vrev or
+    // vmovn.
+    if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
+        VecTy->isIntOrIntVectorTy() && DL.getTypeSizeInBits(SubVecTy) <= 64)
+      return 2 * BaseCost;
   }

   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -998,6 +1062,142 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
   return true;
 }

+static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
+  // We don't allow icmp's, and because we only look at single block loops,
+  // we simply count the icmps, i.e. there should only be 1 for the backedge.
+  if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
+    return false;
+
+  if (isa<FCmpInst>(&I))
+    return false;
+
+  // We could allow extending/narrowing FP loads/stores, but codegen is
+  // too inefficient so reject this for now.
+  if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
+    return false;
+
+  // Extends have to be extending-loads
+  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
+    if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
+      return false;
+
+  // Truncs have to be narrowing-stores
+  if (isa<TruncInst>(&I) )
+    if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
+      return false;
+
+  return true;
+}
+
+// To set up a tail-predicated loop, we need to know the total number of
+// elements processed by that loop. Thus, we need to determine the element
+// size and:
+// 1) it should be uniform for all operations in the vector loop, so we
+//    e.g. don't want any widening/narrowing operations.
+// 2) it should be smaller than i64s because we don't have vector operations
+//    that work on i64s.
+// 3) we don't want elements to be reversed or shuffled, to make sure the
+//    tail-predication masks/predicates the right lanes.
+//
+static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                 const DataLayout &DL,
+                                 const LoopAccessInfo *LAI) {
+  PredicatedScalarEvolution PSE = LAI->getPSE();
+  int ICmpCount = 0;
+  int Stride = 0;
+
+  LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
+  SmallVector<Instruction *, 16> LoadStores;
+  for (BasicBlock *BB : L->blocks()) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
+      if (isa<PHINode>(&I))
+        continue;
+      if (!canTailPredicateInstruction(I, ICmpCount)) {
+        LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
+        return false;
+      }
+
+      Type *T = I.getType();
+      if (T->isPointerTy())
+        T = T->getPointerElementType();
+
+      if (T->getScalarSizeInBits() > 32) {
+        LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
+        return false;
+      }
+
+      if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
+        Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
+        int64_t NextStride = getPtrStride(PSE, Ptr, L);
+        // TODO: for now only allow consecutive strides of 1. We could support
+        // other strides as long as it is uniform, but let's keep it simple for
+        // now.
+        if (Stride == 0 && NextStride == 1) {
+          Stride = NextStride;
+          continue;
+        }
+        if (Stride != NextStride) {
+          LLVM_DEBUG(dbgs() << "Different strides found, can't "
+                               "tail-predicate\n.");
+          return false;
+        }
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
+  return true;
+}
+
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                             ScalarEvolution &SE,
+                                             AssumptionCache &AC,
+                                             TargetLibraryInfo *TLI,
+                                             DominatorTree *DT,
+                                             const LoopAccessInfo *LAI) {
+  if (DisableTailPredication)
+    return false;
+
+  // Creating a predicated vector loop is the first step for generating a
+  // tail-predicated hardware loop, for which we need the MVE masked
+  // load/stores instructions:
+  if (!ST->hasMVEIntegerOps())
+    return false;
+
+  // For now, restrict this to single block loops.
+  if (L->getNumBlocks() > 1) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+                         "loop.\n");
+    return false;
+  }
+
+  assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
+
+  HardwareLoopInfo HWLoopInfo(L);
+  if (!HWLoopInfo.canAnalyze(*LI)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "analyzable.\n");
+    return false;
+  }
+
+  // This checks if we have the low-overhead branch architecture
+  // extension, and if we will create a hardware-loop:
+  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "profitable.\n");
+    return false;
+  }
+
+  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "a candidate.\n");
+    return false;
+  }
+
+  return canTailPredicateLoop(L, LI, SE, DL, LAI);
+}
+
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
@@ -1035,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   unsigned Cost = 0;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
+      // Don't unroll vectorised loop. MVE does not benefit from it as much as
+      // scalar code.
+      if (I.getType()->isVectorTy())
+        return;
+
       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
         ImmutableCallSite CS(&I);
         if (const Function *F = CS.getCalledFunction()) {
@@ -1043,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
         }
         return;
       }
-      // Don't unroll vectorised loop. MVE does not benefit from it as much as
-      // scalar code.
-      if (I.getType()->isVectorTy())
-        return;

       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                             I.value_op_end());
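
The legality rule in the new isLegalMaskedGather above mirrors the existing masked load/store rule: 8-bit elements are always acceptable, 16- and 32-bit elements need their natural alignment, and an unknown alignment is deferred to the cost model. The following is a minimal, standalone sketch of that predicate in plain C++; the function name and the std::optional alignment representation are illustrative stand-ins, not the LLVM API.

#include <cassert>
#include <cstdint>
#include <optional>

// Sketch of the MVE gather/scatter element rule: only 8/16/32-bit elements,
// with natural alignment required when the alignment is known.
bool mveGatherElementLegal(unsigned EltWidthBits,
                           std::optional<uint64_t> AlignBytes) {
  if (EltWidthBits == 32)
    return !AlignBytes || *AlignBytes >= 4;
  if (EltWidthBits == 16)
    return !AlignBytes || *AlignBytes >= 2;
  return EltWidthBits == 8;
}

int main() {
  assert(mveGatherElementLegal(32, 4));           // aligned i32 gather is fine
  assert(!mveGatherElementLegal(32, 2));          // under-aligned i32 is rejected
  assert(mveGatherElementLegal(8, std::nullopt)); // i8 never needs alignment
  assert(!mveGatherElementLegal(64, 8));          // no 64-bit gathers on MVE
}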
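In getArithmeticInstrCost, the NEON division entries are keyed on the legalized vector type, and the returned cost is LT.first (how many legal-sized operations the type splits into) times the table entry. Here is a small, LLVM-free sketch of that lookup with two of the entries from the table above; the enum and function names are invented for illustration and the fallback value is simplified.

#include <cstdio>

enum Op { SDIV, UDIV };
enum VecTy { V4I32, V16I8 };

struct CostEntry { Op Opc; VecTy Ty; int Cost; };

constexpr int FunctionCallDivCost = 20;           // roughly one libcall per element
constexpr CostEntry Table[] = {
    {SDIV, V4I32, 4 * FunctionCallDivCost},       // 4 elements -> cost 80
    {SDIV, V16I8, 16 * FunctionCallDivCost},      // 16 elements -> cost 320
};

// Find the (opcode, legalized type) entry and scale it by the number of
// legalized operations, as the real code does with LT.first * Entry->Cost.
int arithmeticCost(Op Opc, VecTy Ty, int NumLegalizedOps /* LT.first */) {
  for (const CostEntry &E : Table)
    if (E.Opc == Opc && E.Ty == Ty)
      return NumLegalizedOps * E.Cost;
  return 1; // simplified default; the real code has further special cases
}

int main() {
  std::printf("%d\n", arithmeticCost(SDIV, V4I32, 1)); // v4i32 sdiv: 80
  std::printf("%d\n", arithmeticCost(SDIV, V4I32, 2)); // v8i32 splits in two: 160
}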
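The LooksLikeAFreeShift lambda returns a cost of 0 when a scalar shift by a uniform constant has a single user that can absorb it as a shifted operand (ADD, SUB, AND, EOR, ORR, compares, and so on), which is how ARM/Thumb-2 data-processing instructions encode "operand2 with shift". A rough standalone model of the same decision is sketched below; the Inst struct and enums stand in for LLVM's Instruction and opcodes and are not real API.

#include <cstddef>

enum class Opc { Shl, LShr, AShr, Add, Sub, And, Xor, Or, ICmp, Other };

struct Inst {
  Opc Opcode;
  bool ShiftAmountIsUniformConstant;
  std::size_t NumUses;
  const Inst *OnlyUser; // valid when NumUses == 1
};

bool isShift(Opc O) { return O == Opc::Shl || O == Opc::LShr || O == Opc::AShr; }

// A scalar shift-by-constant whose only use can take a shifted register
// operand costs nothing, because it folds into that instruction.
bool looksLikeAFreeShift(const Inst &I, bool IsThumb1, bool IsVector) {
  if (IsThumb1 || IsVector)               // Thumb-1 and vector shifts don't fold
    return false;
  if (!isShift(I.Opcode) || I.NumUses != 1 || !I.ShiftAmountIsUniformConstant)
    return false;
  switch (I.OnlyUser->Opcode) {           // consumers with a shifted operand2
  case Opc::Add: case Opc::Sub: case Opc::And:
  case Opc::Xor: case Opc::Or:  case Opc::ICmp:
    return true;
  default:
    return false;
  }
}

int main() {
  Inst Add{Opc::Add, false, 1, nullptr};
  Inst Shift{Opc::Shl, true, 1, &Add};    // single-use shl by a constant
  return looksLikeAFreeShift(Shift, /*IsThumb1=*/false, /*IsVector=*/false) ? 0 : 1;
}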
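The getInterleavedMemoryOpCost change scales the vldN/vstN cost by the MVE vector cost factor and adds a cheap path for factor-2 integer patterns narrower than a legal vector (v4i8, v8i8, v4i16), which can be handled with an ordinary load plus a vrev/vmovn. The sketch below works through the resulting formula; the legality test is simplified to "128-bit multiples", and the base cost of 2 is only an assumed value for ST->getMVEVectorCostFactor().

#include <cstdio>

// Simplified model of the MVE interleaved load/store cost.
int interleavedCost(unsigned Factor, unsigned NumElts, unsigned EltBits,
                    bool IsIntVector, unsigned BaseCost = 2 /* assumed MVE factor */) {
  unsigned SubVecBits = (NumElts / Factor) * EltBits;
  bool LegalVldN = (NumElts % Factor == 0) && SubVecBits != 0 &&
                   (SubVecBits % 128 == 0);       // simplified legality check
  if (LegalVldN)
    return Factor * BaseCost * (SubVecBits / 128); // one vldN/vstN per 128 bits
  // Small factor-2 integer patterns: a plain load plus a vrev/vmovn.
  if (Factor == 2 && NumElts / Factor > 2 && IsIntVector && SubVecBits <= 64)
    return 2 * BaseCost;
  return -1; // the real code falls back to the generic scalarized cost here
}

int main() {
  std::printf("%d\n", interleavedCost(2, 8, 32, true)); // v8i32, factor 2 -> 2*2*1 = 4
  std::printf("%d\n", interleavedCost(2, 8, 8, true));  // v8i8, factor 2  -> 2*2   = 4
}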
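Finally, canTailPredicateLoop boils down to a per-instruction filter (at most one ICmp, no FP compares or FP extend/trunc, extends only as extending loads, truncs only as narrowing stores), a maximum element size of 32 bits, and a single consecutive stride of 1 shared by every load and store. The stride and element-size part of that scan is sketched here over a simplified representation of the loop's memory accesses; the struct and function names are illustrative, not LLVM API.

#include <cstdint>
#include <vector>

// Stand-in for the memory accesses of a single-block vector loop.
struct MemAccess {
  unsigned EltSizeBits; // element size of the access
  int64_t Stride;       // consecutive stride in elements, as getPtrStride reports
};

// Every access must use elements of at most 32 bits, and all accesses must
// share a consecutive stride of exactly 1 (reversed or gather-like accesses
// are rejected), mirroring the checks in canTailPredicateLoop above.
bool canTailPredicate(const std::vector<MemAccess> &Accesses) {
  int64_t Stride = 0; // 0 = no stride seen yet
  for (const MemAccess &A : Accesses) {
    if (A.EltSizeBits > 32)
      return false;
    if (Stride == 0 && A.Stride == 1) {
      Stride = A.Stride;
      continue;
    }
    if (A.Stride != Stride)
      return false;
  }
  return true;
}

int main() {
  std::vector<MemAccess> Ok  = {{32, 1}, {8, 1}};   // i32 and i8, both stride 1
  std::vector<MemAccess> Bad = {{32, 1}, {32, -1}}; // reversed access rejected
  return (canTailPredicate(Ok) && !canTailPredicate(Bad)) ? 0 : 1;
}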