Diffstat (limited to 'llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp')
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 104
1 file changed, 90 insertions, 14 deletions
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cf7456e9e4f5..88de84a4fd78 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -149,7 +149,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     Align MemAlign =
         getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
                           &IC.getAssumptionCache(), &IC.getDominatorTree());
-    unsigned AlignArg = II.getNumArgOperands() - 1;
+    unsigned AlignArg = II.arg_size() - 1;
     Value *AlignArgOp = II.getArgOperand(AlignArg);
     MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
     if (Align && *Align < MemAlign) {
@@ -175,7 +175,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
                                        PatternMatch::m_Constant(XorMask))) &&
         II.getType() == ArgArg->getType()) {
       if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
-        if (CI->getValue().trunc(16).isAllOnesValue()) {
+        if (CI->getValue().trunc(16).isAllOnes()) {
           auto TrueVector = IC.Builder.CreateVectorSplat(
               cast<FixedVectorType>(II.getType())->getNumElements(),
               IC.Builder.getTrue());
@@ -248,6 +248,48 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   return None;
 }
 
+Optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
+    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
+    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
+    std::function<void(Instruction *, unsigned, APInt, APInt &)>
+        SimplifyAndSetOp) const {
+
+  // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
+  // opcode specifying a Top/Bottom instruction, which can change between
+  // instructions.
+  auto SimplifyNarrowInstrTopBottom = [&](unsigned TopOpc) {
+    unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
+    unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
+
+    // The only odd/even lanes of operand 0 will only be demanded depending
+    // on whether this is a top/bottom instruction.
+    APInt DemandedElts =
+        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
+                                       : APInt::getHighBitsSet(2, 1));
+    SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
+    // The other lanes will be defined from the inserted elements.
+    UndefElts &= APInt::getSplat(NumElts, !IsTop ? APInt::getLowBitsSet(2, 1)
+                                                 : APInt::getHighBitsSet(2, 1));
+    return None;
+  };
+
+  switch (II.getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::arm_mve_vcvt_narrow:
+    SimplifyNarrowInstrTopBottom(2);
+    break;
+  case Intrinsic::arm_mve_vqmovn:
+    SimplifyNarrowInstrTopBottom(4);
+    break;
+  case Intrinsic::arm_mve_vshrn:
+    SimplifyNarrowInstrTopBottom(7);
+    break;
+  }
+
+  return None;
+}
+
 InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                           TTI::TargetCostKind CostKind) {
   assert(Ty->isIntegerTy());
@@ -300,7 +342,7 @@ static bool isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
 
   if (InstSPF == SPF_SMAX &&
       PatternMatch::match(RHS, PatternMatch::m_ConstantInt(C)) &&
-      C->getValue() == Imm && Imm.isNegative() && (-Imm).isPowerOf2()) {
+      C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
 
     auto isSSatMin = [&](Value *MinInst) {
       if (isa<SelectInst>(MinInst)) {
@@ -368,7 +410,7 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
   }
 
   // xor a, -1 can always be folded to MVN
-  if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
+  if (Opcode == Instruction::Xor && Imm.isAllOnes())
     return 0;
 
   // Ensures negative constant of min(max()) or max(min()) patterns that
@@ -381,6 +423,14 @@ InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
       return 0;
   }
 
+  // We can convert <= -1 to < 0, which is generally quite cheap.
+  if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnesValue()) {
+    ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
+    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
+      return std::min(getIntImmCost(Imm, Ty, CostKind),
+                      getIntImmCost(Imm + 1, Ty, CostKind));
+  }
+
   return getIntImmCost(Imm, Ty, CostKind);
 }
 
@@ -1623,13 +1673,24 @@ ARMTTIImpl::getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                         TTI::TargetCostKind CostKind) {
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
+
   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
     std::pair<InstructionCost, MVT> LT =
         TLI->getTypeLegalizationCost(DL, ValTy);
-    if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
-        (LT.second == MVT::v8i16 &&
-         ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
-        (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+
+    // The legal cases are:
+    //   VADDV u/s 8/16/32
+    //   VMLAV u/s 8/16/32
+    //   VADDLV u/s 32
+    //   VMLALV u/s 16/32
+    // Codegen currently cannot always handle larger than legal vectors very
+    // well, especially for predicated reductions where the mask needs to be
+    // split, so restrict to 128bit or smaller input types.
+    unsigned RevVTSize = ResVT.getSizeInBits();
+    if (ValVT.getSizeInBits() <= 128 &&
+        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
+         (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64u : 32u)) ||
+         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }
 
@@ -1949,6 +2010,20 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
   // we simply count the icmps, i.e. there should only be 1 for the backedge.
   if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
     return false;
+  // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
+  // not currently canonical, but soon will be. Code without them uses icmp, and
+  // so is not tail predicated as per the condition above. In order to get the
+  // same performance we treat min and max the same as an icmp for tailpred
+  // purposes for the moment (we often rely on non-tailpred and higher VF's to
+  // pick more optimial instructions like VQDMULH. They need to be recognized
+  // directly by the vectorizer).
+  if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    if ((II->getIntrinsicID() == Intrinsic::smin ||
+         II->getIntrinsicID() == Intrinsic::smax ||
+         II->getIntrinsicID() == Intrinsic::umin ||
+         II->getIntrinsicID() == Intrinsic::umax) &&
+        ++ICmpCount > 1)
+      return false;
 
   if (isa<FCmpInst>(&I))
     return false;
@@ -2035,8 +2110,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
         return false;
       }
       if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
-        Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
-        int64_t NextStride = getPtrStride(PSE, Ptr, L);
+        Value *Ptr = getLoadStorePointerOperand(&I);
+        Type *AccessTy = getLoadStoreType(&I);
+        int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L);
         if (NextStride == 1) {
           // TODO: for now only allow consecutive strides of 1. We could support
           // other strides as long as it is uniform, but let's keep it simple
@@ -2055,8 +2131,7 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
         // least if they are loop invariant.
        // TODO: Loop variant strides should in theory work, too, but
         // this requires further testing.
-        const SCEV *PtrScev =
-            replaceSymbolicStrideSCEV(PSE, llvm::ValueToValueMap(), Ptr);
+        const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
         if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
           const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
           if (PSE.getSE()->isLoopInvariant(Step, L))
@@ -2135,14 +2210,15 @@ bool ARMTTIImpl::emitGetActiveLaneMask() const {
   return true;
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
-                                         TTI::UnrollingPreferences &UP) {
+                                         TTI::UnrollingPreferences &UP,
+                                         OptimizationRemarkEmitter *ORE) {
   // Enable Upper bound unrolling universally, not dependant upon the conditions
   // below.
   UP.UpperBound = true;
   // Only currently enable these preferences for M-Class cores.
   if (!ST->isMClass())
-    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
+    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
 
   // Disable loop unrolling for Oz and Os.
   UP.OptSizeThreshold = 0;
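
Editor's note, not part of the patch above: the two-bit splat trick in SimplifyNarrowInstrTopBottom is compact and easy to misread, so the following is a minimal standalone sketch of how those lane masks come out. It assumes only LLVM's APInt from llvm/ADT/APInt.h; the NumElts value, the file name in the compile hint, and the printing loop are illustrative and do not come from the commit.

// Standalone illustration (not from the patch): build the per-lane mask the
// same way SimplifyNarrowInstrTopBottom does and print which lanes end up
// demanded from operand 0. Compile against LLVM headers, e.g.
//   clang++ demanded_lanes.cpp $(llvm-config --cxxflags --ldflags --libs support)
#include "llvm/ADT/APInt.h"
#include <cstdio>

using namespace llvm;

int main() {
  const unsigned NumElts = 8; // e.g. the v8i16 result of a narrowing intrinsic

  for (bool IsTop : {false, true}) {
    // Replicate a 2-bit pattern across all lanes: 0b01 (low bit set) marks the
    // even lanes, 0b10 (high bit set) marks the odd lanes.
    APInt DemandedElts =
        APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                       : APInt::getHighBitsSet(2, 1));

    std::printf("IsTop=%d demanded lanes of operand 0:", (int)IsTop);
    for (unsigned I = 0; I != NumElts; ++I)
      std::printf(" %u", DemandedElts[I] ? 1u : 0u);
    std::printf("\n");
  }
  return 0;
}

The patch's subsequent UndefElts update uses the opposite pattern (!IsTop), so between the two masks every lane of the result is accounted for exactly once.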
