Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp         |  144
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp   |   74
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h      |   23
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                | 1305
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp                |  730
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h                |   95
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.cpp                        |  301
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlan.h                          |  372
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h             |    3
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp              |   31
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanTransforms.h                |    6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanValue.h                     |   55
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp                |    1
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.h                  |    8
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp                |  699
-rw-r--r--  llvm/lib/Transforms/Vectorize/Vectorize.cpp                    |    4
16 files changed, 2661 insertions, 1190 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7478daa2a0a52..9b81afbb4b6cb 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -50,7 +50,6 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -129,22 +128,6 @@ public:
private:
unsigned getPointerAddressSpace(Value *I);
- unsigned getAlignment(LoadInst *LI) const {
- unsigned Align = LI->getAlignment();
- if (Align != 0)
- return Align;
-
- return DL.getABITypeAlignment(LI->getType());
- }
-
- unsigned getAlignment(StoreInst *SI) const {
- unsigned Align = SI->getAlignment();
- if (Align != 0)
- return Align;
-
- return DL.getABITypeAlignment(SI->getValueOperand()->getType());
- }
-
static const unsigned MaxDepth = 3;
bool isConsecutiveAccess(Value *A, Value *B);
@@ -447,20 +430,78 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
// Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
+ auto CheckFlags = [](Instruction *I, bool Signed) {
+ BinaryOperator *BinOpI = cast<BinaryOperator>(I);
+ return (Signed && BinOpI->hasNoSignedWrap()) ||
+ (!Signed && BinOpI->hasNoUnsignedWrap());
+ };
+
// First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
// ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
- IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
- if (Signed)
- Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
- else
- Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
+ CheckFlags(OpB, Signed))
+ Safe = true;
+
+ // Second attempt: If both OpA and OpB are adds with NSW/NUW and share
+ // the same LHS operand, we can guarantee that the transformation is safe
+ // if we can prove that OpA won't overflow when IdxDiff is added to the
+ // RHS of OpA.
+ // For example:
+ // %tmp7 = add nsw i32 %tmp2, %v0
+ // %tmp8 = sext i32 %tmp7 to i64
+ // ...
+ // %tmp11 = add nsw i32 %v0, 1
+ // %tmp12 = add nsw i32 %tmp2, %tmp11
+ // %tmp13 = sext i32 %tmp12 to i64
+ //
+ // Both %tmp7 and %tmp12 have the nsw flag and the same first operand,
+ // %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
+ // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
+ // nsw flag.
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
+ OpB->getOpcode() == Instruction::Add &&
+ OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
+ CheckFlags(OpB, Signed)) {
+ Value *RHSA = OpA->getOperand(1);
+ Value *RHSB = OpB->getOperand(1);
+ Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
+ Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
+ // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
+ if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
+ if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw c)` and
+ // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
+ if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
+ OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
+ isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstValA =
+ cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ int64_t CstValB =
+ cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
+ IdxDiff.getSExtValue() == (CstValB - CstValA))
+ Safe = true;
+ }
}
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
- // Second attempt:
+ // Third attempt:
// If all set bits of IdxDiff or any higher order bit other than the sign bit
// are known to be zero in ValA, we can add Diff to it while guaranteeing no
// overflow of any sort.
@@ -503,7 +544,6 @@ bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
}
void Vectorizer::reorder(Instruction *I) {
- OrderedBasicBlock OBB(I->getParent());
SmallPtrSet<Instruction *, 16> InstructionsToMove;
SmallVector<Instruction *, 16> Worklist;
@@ -521,7 +561,7 @@ void Vectorizer::reorder(Instruction *I) {
if (IM->getParent() != I->getParent())
continue;
- if (!OBB.dominates(IM, I)) {
+ if (!IM->comesBefore(I)) {
InstructionsToMove.insert(IM);
Worklist.push_back(IM);
}
@@ -637,8 +677,6 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
}
}
- OrderedBasicBlock OBB(Chain[0]->getParent());
-
// Loop until we find an instruction in ChainInstrs that we can't vectorize.
unsigned ChainInstrIdx = 0;
Instruction *BarrierMemoryInstr = nullptr;
@@ -648,14 +686,14 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// If a barrier memory instruction was found, chain instructions that follow
// will not be added to the valid prefix.
- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
break;
// Check (in BB order) if any instruction prevents ChainInstr from being
// vectorized. Find and store the first such "conflicting" instruction.
for (Instruction *MemInstr : MemoryInstrs) {
// If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
break;
auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
@@ -674,12 +712,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
if (isa<StoreInst>(MemInstr) && ChainLoad &&
- (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
+ (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
continue;
// Same case, but in reverse.
if (MemLoad && isa<StoreInst>(ChainInstr) &&
- (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
+ (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
@@ -705,7 +743,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// the basic block.
if (IsLoadChain && BarrierMemoryInstr) {
// The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
+ assert(BarrierMemoryInstr->comesBefore(ChainInstr));
break;
}
}
@@ -961,7 +999,7 @@ bool Vectorizer::vectorizeStoreChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
- unsigned Alignment = getAlignment(S0);
+ Align Alignment = S0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
@@ -992,10 +1030,10 @@ bool Vectorizer::vectorizeStoreChain(
VectorType *VecTy;
VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
if (VecStoreTy)
- VecTy = VectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
+ VecTy = FixedVectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
else
- VecTy = VectorType::get(StoreTy, Chain.size());
+ VecTy = FixedVectorType::get(StoreTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
@@ -1019,18 +1057,20 @@ bool Vectorizer::vectorizeStoreChain(
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the store is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
vectorizeStoreChain(Chains.second, InstructionsProcessed);
}
- unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- StackAdjustedAlignment,
- DL, S0, nullptr, &DT);
- if (NewAlign != 0)
+ Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, S0, nullptr, &DT);
+ if (NewAlign >= Alignment)
Alignment = NewAlign;
+ else
+ return false;
}
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
@@ -1112,7 +1152,7 @@ bool Vectorizer::vectorizeLoadChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
- unsigned Alignment = getAlignment(L0);
+ Align Alignment = L0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
@@ -1142,10 +1182,10 @@ bool Vectorizer::vectorizeLoadChain(
VectorType *VecTy;
VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
if (VecLoadTy)
- VecTy = VectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
+ VecTy = FixedVectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
else
- VecTy = VectorType::get(LoadTy, Chain.size());
+ VecTy = FixedVectorType::get(LoadTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
@@ -1162,15 +1202,20 @@ bool Vectorizer::vectorizeLoadChain(
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the load is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
vectorizeLoadChain(Chains.second, InstructionsProcessed);
}
- Alignment = getOrEnforceKnownAlignment(
- L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
+ Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, L0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
}
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
@@ -1194,7 +1239,8 @@ bool Vectorizer::vectorizeLoadChain(
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+ LoadInst *LI =
+ Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
propagateMetadata(LI, Chain);
if (VecLoadTy) {
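A standalone sketch (not part of the patch; the function and value names are invented for illustration) of the two IR APIs the hunks above switch to: Instruction::comesBefore(), which replaces the ordering queries previously served by OrderedBasicBlock, and the Align value returned by LoadInst::getAlign(), which removes the old "0 means ABI alignment" special case that the deleted getAlignment() helpers papered over.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  // void @f(i32* %p) with two loads, so there is an intra-block order to query.
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx),
                                 {Type::getInt32PtrTy(Ctx)}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "f", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  IRBuilder<> B(BB);
  Value *Ptr = &*F->arg_begin();
  LoadInst *L0 = B.CreateLoad(Type::getInt32Ty(Ctx), Ptr, "l0");
  LoadInst *L1 = B.CreateLoad(Type::getInt32Ty(Ctx), Ptr, "l1");
  B.CreateRetVoid();
  // Same-block ordering query; this is what replaces OBB.dominates(IM, I).
  outs() << "l0 before l1: " << (L0->comesBefore(L1) ? "yes" : "no") << "\n";
  // getAlign() returns a plain Align, so there is no need to map 0 to the
  // ABI type alignment by hand.
  outs() << "l0 alignment: " << L0->getAlign().value() << "\n";
  return 0;
}

Because the instruction ordering behind comesBefore() is cached on the basic block itself and invalidated on modification, the vectorizer no longer has to build and discard a separate OrderedBasicBlock per block.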
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3f943f4c0688e..23613775d896d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -13,14 +13,17 @@
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;
+using namespace PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -566,6 +569,28 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
return false;
}
+/// Checks if a function is scalarizable according to the TLI, in
+/// the sense that it should be vectorized and then expanded in
+/// multiple scalar calls. This is represented in the
+/// TLI via mappings that do not specify a vector name, as in the
+/// following example:
+///
+/// const VecDesc VecIntrinsics[] = {
+/// {"llvm.phx.abs.i32", "", 4}
+/// };
+static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
+ const StringRef ScalarName = CI.getCalledFunction()->getName();
+ bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
+ // Check that all known VFs are not associated with a vector
+ // function, i.e. the vector name is empty.
+ if (Scalarize)
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
+ VF <= WidestVF; VF *= 2) {
+ Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
+ }
+ return Scalarize;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *Header = TheLoop->getHeader();
@@ -644,6 +669,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
SinkAfter, DT)) {
+ AllowedExit.insert(Phi);
FirstOrderRecurrences.insert(Phi);
continue;
}
@@ -667,10 +693,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// * Have a mapping to an IR intrinsic.
// * Have a vector version available.
auto *CI = dyn_cast<CallInst>(&I);
+
if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
!isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
- TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ (!VFDatabase::getMappings(*CI).empty() ||
+ isTLIScalarize(*TLI, *CI)))) {
// If the call is a recognized math library call, it is likely that
// we can vectorize it given loosened floating-point constraints.
LibFunc Func;
@@ -685,7 +713,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// but it's hard to provide meaningful yet generic advice.
// Also, should this be guarded by allowExtraAnalysis() and/or be part
// of the returned info from isFunctionVectorizable()?
- reportVectorizationFailure("Found a non-intrinsic callsite",
+ reportVectorizationFailure(
+ "Found a non-intrinsic callsite",
"library call cannot be vectorized. "
"Try compiling with -fno-math-errno, -ffast-math, "
"or similar flags",
@@ -739,11 +768,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// supported on the target.
if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
// Arbitrarily try a vector of 2 elements.
- Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2);
assert(VecTy && "did not find vectorized version of stored type");
- const MaybeAlign Alignment = getLoadStoreAlignment(ST);
- assert(Alignment && "Alignment should be set");
- if (!TTI->isLegalNTStore(VecTy, *Alignment)) {
+ if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
reportVectorizationFailure(
"nontemporal store instruction cannot be vectorized",
"nontemporal store instruction cannot be vectorized",
@@ -756,11 +783,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
// For nontemporal loads, check that a nontemporal vector version is
// supported on the target (arbitrarily try a vector of 2 elements).
- Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2);
assert(VecTy && "did not find vectorized version of load type");
- const MaybeAlign Alignment = getLoadStoreAlignment(LD);
- assert(Alignment && "Alignment should be set");
- if (!TTI->isLegalNTLoad(VecTy, *Alignment)) {
+ if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
reportVectorizationFailure(
"nontemporal load instruction cannot be vectorized",
"nontemporal load instruction cannot be vectorized",
@@ -897,6 +922,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (C->canTrap())
return false;
}
+
+ // We can predicate blocks with calls to assume, as long as we drop them in
+ // case we flatten the CFG via predication.
+ if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
+ ConditionalAssumes.insert(&I);
+ continue;
+ }
+
// We might be able to hoist the load.
if (I.mayReadFromMemory()) {
auto *LI = dyn_cast<LoadInst>(&I);
@@ -947,14 +980,14 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// the memory pointed to can be dereferenced (with the access size implied by
// the value's type) unconditionally within the loop header without
// introducing a new fault.
- SmallPtrSet<Value *, 8> SafePointes;
+ SmallPtrSet<Value *, 8> SafePointers;
// Collect safe addresses.
for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockNeedsPredication(BB)) {
for (Instruction &I : *BB)
if (auto *Ptr = getLoadStorePointerOperand(&I))
- SafePointes.insert(Ptr);
+ SafePointers.insert(Ptr);
continue;
}
@@ -968,7 +1001,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (LI && !mustSuppressSpeculation(*LI) &&
isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
- SafePointes.insert(LI->getPointerOperand());
+ SafePointers.insert(LI->getPointerOperand());
}
}
@@ -986,7 +1019,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes)) {
+ if (!blockCanBePredicated(BB, SafePointers)) {
reportVectorizationFailure(
"Control flow cannot be substituted for a select",
"control flow cannot be substituted for a select",
@@ -1198,18 +1231,9 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
- if (!PrimaryInduction) {
- reportVectorizationFailure(
- "No primary induction, cannot fold tail by masking",
- "Missing a primary induction variable in the loop, which is "
- "needed in order to fold tail by masking as required.",
- "NoPrimaryInduction", ORE, TheLoop);
- return false;
- }
-
SmallPtrSet<const Value *, 8> ReductionLiveOuts;
- for (auto &Reduction : *getReductionVars())
+ for (auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
// TODO: handle non-reduction outside users when tail is folded by masking.
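For context on the VFDatabase check that replaces the bare TLI->isFunctionVectorizable() call in canVectorizeInstrs(): vector variants of a call are now looked up through VFDatabase, which reads the call's "vector-function-abi-variant" attribute (normally attached by the InjectTLIMappings pass that the pass pipeline below now requires). A minimal sketch of that query follows; the module text and the names @foo and @caller are made up for illustration and are not taken from the patch.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  SMDiagnostic Err;
  // @foo carries no "vector-function-abi-variant" attribute here, so both
  // queries below come back empty; with mappings injected by InjectTLIMappings
  // they would return the declared variants.
  std::unique_ptr<Module> M = parseAssemblyString(
      "declare float @foo(float)\n"
      "define float @caller(float %x) {\n"
      "  %r = call float @foo(float %x)\n"
      "  ret float %r\n"
      "}\n",
      Err, Ctx);
  Function *Caller = M->getFunction("caller");
  for (Instruction &I : instructions(*Caller)) {
    auto *CI = dyn_cast<CallInst>(&I);
    if (!CI)
      continue;
    // Non-empty mappings are what the legality check above accepts as
    // "a vector version is available".
    outs() << "mappings for foo: " << VFDatabase::getMappings(*CI).size() << "\n";
    // Ask for a concrete VF=4 variant, mirroring getVectorCallCost().
    VFShape Shape = VFShape::get(*CI, {4, /*Scalable=*/false},
                                 /*HasGlobalPred=*/false);
    if (Function *VecFn = VFDatabase(*CI).getVectorizedFunction(Shape))
      outs() << "VF=4 variant: " << VecFn->getName() << "\n";
    else
      outs() << "no VF=4 variant\n";
  }
  return 0;
}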
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c3ca43fcd4927..8dd06983cd84d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -31,9 +31,12 @@
namespace llvm {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class PredicatedScalarEvolution;
+
/// VPlan-based builder utility analogous to IRBuilder.
class VPBuilder {
-private:
VPBasicBlock *BB = nullptr;
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
@@ -204,6 +207,8 @@ class LoopVectorizationPlanner {
/// The interleaved access analysis.
InterleavedAccessInfo &IAI;
+ PredicatedScalarEvolution &PSE;
+
SmallVector<VPlanPtr, 4> VPlans;
/// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -229,13 +234,14 @@ public:
const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
- InterleavedAccessInfo &IAI)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
- IAI(IAI) {}
+ InterleavedAccessInfo &IAI,
+ PredicatedScalarEvolution &PSE)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
+ PSE(PSE) {}
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<VectorizationFactor> plan(unsigned UserVF);
+ Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -279,9 +285,10 @@ private:
/// Build a VPlan using VPRecipes according to the information gather by
/// Legal. This method is only used for the legacy inner loop vectorizer.
- VPlanPtr
- buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
+ VPlanPtr buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ const DenseMap<Instruction *, Instruction *> &SinkAfter);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 684a3098e5645..35af8e4257789 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -91,7 +91,6 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -134,9 +133,11 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
@@ -294,15 +295,6 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-/// A helper function for converting Scalar types to vector types.
-/// If the incoming type is void, we return void. If the VF is 1, we return
-/// the scalar type.
-static Type *ToVectorTy(Type *Scalar, unsigned VF) {
- if (Scalar->isVoidTy() || VF == 1)
- return Scalar;
- return VectorType::get(Scalar, VF);
-}
-
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -319,7 +311,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
- auto *VectorTy = VectorType::get(Ty, VF);
+ auto *VectorTy = FixedVectorType::get(Ty, VF);
return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
}
@@ -415,7 +407,16 @@ public:
BasicBlock *createVectorizedLoopSkeleton();
/// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I);
+ void widenInstruction(Instruction &I, VPUser &Operands,
+ VPTransformState &State);
+
+ /// Widen a single call instruction within the innermost loop.
+ void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ VPTransformState &State);
+
+ /// Widen a single select instruction within the innermost loop.
+ void widenSelectInstruction(SelectInst &I, VPUser &Operands,
+ bool InvariantCond, VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
@@ -430,8 +431,9 @@ public:
/// Vectorize a single GetElementPtrInst based on information gathered and
/// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
- bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+ void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
+ unsigned VF, bool IsPtrLoopInvariant,
+ SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -441,9 +443,11 @@ public:
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive..
- void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
- bool IfPredicateInstr);
+ /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// Instr's operands.
+ void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
+ const VPIteration &Instance, bool IfPredicateInstr,
+ VPTransformState &State);
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
@@ -482,20 +486,21 @@ public:
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
- /// Try to vectorize the interleaved access group that \p Instr belongs to
- /// with the base address given in \p Addr, optionally masking the vector
- /// operations if \p BlockInMask is non-null. Use \p State to translate given
- /// VPValues to IR values in the vectorized loop.
- void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
- VPValue *Addr, VPValue *BlockInMask = nullptr);
+ /// Try to vectorize interleaved access group \p Group with the base address
+ /// given in \p Addr, optionally masking the vector operations if \p
+ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
+ /// values in the vectorized loop.
+ void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ VPTransformState &State, VPValue *Addr,
+ VPValue *BlockInMask = nullptr);
/// Vectorize Load and Store instructions with the base address given in \p
/// Addr, optionally masking the vector operations if \p BlockInMask is
/// non-null. Use \p State to translate given VPValues to IR values in the
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Addr,
- VPValue *BlockInMask = nullptr);
+ VPValue *Addr, VPValue *StoredValue,
+ VPValue *BlockInMask);
/// Set the debug location in the builder using the debug location in
/// the instruction.
@@ -682,7 +687,7 @@ protected:
DominatorTree *DT;
/// Alias Analysis.
- AliasAnalysis *AA;
+ AAResults *AA;
/// Target Library Info.
const TargetLibraryInfo *TLI;
@@ -974,7 +979,7 @@ public:
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF();
+ Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@@ -1066,7 +1071,7 @@ public:
auto UniformsPerVF = Uniforms.find(VF);
assert(UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity");
- return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
+ return UniformsPerVF->second.count(I);
}
/// Returns true if \p I is known to be scalar after vectorization.
@@ -1082,7 +1087,7 @@ public:
auto ScalarsPerVF = Scalars.find(VF);
assert(ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF");
- return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
+ return ScalarsPerVF->second.count(I);
}
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
@@ -1200,27 +1205,27 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedLoad(DataType, Alignment);
}
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
return TTI.isLegalMaskedScatter(DataType, Alignment);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) {
return TTI.isLegalMaskedGather(DataType, Alignment);
}
@@ -1232,7 +1237,7 @@ public:
if (!LI && !SI)
return false;
auto *Ty = getMemInstValueType(V);
- MaybeAlign Align = getLoadStoreAlignment(V);
+ Align Align = getLoadStoreAlignment(V);
return (LI && isLegalMaskedGather(Ty, Align)) ||
(SI && isLegalMaskedScatter(Ty, Align));
}
@@ -1309,11 +1314,19 @@ public:
/// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+ /// Invalidates decisions already taken by the cost model.
+ void invalidateCostModelingDecisions() {
+ WideningDecisions.clear();
+ Uniforms.clear();
+ Scalars.clear();
+ }
+
private:
unsigned NumPredStores = 0;
- /// \return An upper bound for the vectorization factor, larger than zero.
- /// One is returned if vectorization should best be avoided due to cost.
+ /// \return An upper bound for the vectorization factor, a power-of-2 larger
+ /// than zero. One is returned if vectorization should best be avoided due
+ /// to cost.
unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
@@ -1598,9 +1611,8 @@ struct LoopVectorize : public FunctionPass {
explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID) {
- Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
- Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
+ : FunctionPass(ID),
+ Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -1626,7 +1638,7 @@ struct LoopVectorize : public FunctionPass {
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI);
+ GetLAA, *ORE, PSI).MadeAnyChange;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1640,6 +1652,7 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
// We currently do not preserve loopinfo/dominator analyses with outer loop
// vectorization. Until this is addressed, mark these analyses as preserved
@@ -1724,9 +1737,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ Value *SplatVF =
+ isa<Constant>(Mul)
+ ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -1806,57 +1820,37 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
- auto II = Legal->getInductionVars()->find(IV);
- assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
+ auto II = Legal->getInductionVars().find(IV);
+ assert(II != Legal->getInductionVars().end() && "IV is not an induction");
auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- // The scalar value to broadcast. This will be derived from the canonical
- // induction variable.
- Value *ScalarIV = nullptr;
-
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
- // True if we have vectorized the induction variable.
- auto VectorizedIV = false;
-
- // Determine if we want a scalar version of the induction variable. This is
- // true if the induction variable itself is not widened, or if it has at
- // least one user in the loop that is not widened.
- auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
- assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
- "Induction step should be loop invariant");
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- Value *Step = nullptr;
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- LoopVectorPreHeader->getTerminator());
- } else {
- Step = cast<SCEVUnknown>(ID.getStep())->getValue();
- }
-
- // Try to create a new independent vector induction variable. If we can't
- // create the phi node, we will splat the scalar induction variable in each
- // loop iteration.
- if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
- VectorizedIV = true;
- }
+ auto CreateStepValue = [&](const SCEV *Step) -> Value * {
+ assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
+ "Induction step should be loop invariant");
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(),
+ LoopVectorPreHeader->getTerminator());
+ }
+ return cast<SCEVUnknown>(Step)->getValue();
+ };
- // If we haven't yet vectorized the induction variable, or if we will create
- // a scalar one, we need to define the scalar induction variable and step
- // values. If we were given a truncation type, truncate the canonical
+ // The scalar value to broadcast. This is derived from the canonical
+ // induction variable. If a truncation type is given, truncate the canonical
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
- if (!VectorizedIV || NeedsScalarIV) {
- ScalarIV = Induction;
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = Induction;
if (IV != OldInduction) {
ScalarIV = IV->getType()->isIntegerTy()
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
@@ -1872,12 +1866,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
Step = Builder.CreateTrunc(Step, TruncType);
}
- }
+ return ScalarIV;
+ };
- // If we haven't yet vectorized the induction variable, splat the scalar
- // induction variable, and build the necessary step vectors.
- // TODO: Don't do it unless the vectorized IV is really required.
- if (!VectorizedIV) {
+ // Create the vector values from the scalar IV, in the absence of creating a
+ // vector IV.
+ auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
@@ -1887,23 +1881,53 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
addMetadata(EntryPart, Trunc);
recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
}
+ };
+
+ // Now do the actual transformations, and start with creating the step value.
+ Value *Step = CreateStepValue(ID.getStep());
+ if (VF <= 1) {
+ Value *ScalarIV = CreateScalarIV(Step);
+ CreateSplatIV(ScalarIV, Step);
+ return;
+ }
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = needsScalarInduction(EntryVal);
+ if (!NeedsScalarIV) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ return;
}
- // If an induction variable is only used for counting loop iterations or
- // calculating addresses, it doesn't need to be widened. Create scalar steps
- // that can be used by instructions we will later scalarize. Note that the
- // addition of the scalar steps will not increase the number of instructions
- // in the loop in the common case prior to InstCombine. We will be trading
- // one vector extract for each scalar step.
- if (NeedsScalarIV)
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (!shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ Value *ScalarIV = CreateScalarIV(Step);
+ // Create scalar steps that can be used by instructions we will later
+ // scalarize. Note that the addition of the scalar steps will not increase
+ // the number of instructions in the loop in the common case prior to
+ // InstCombine. We will be trading one vector extract for each scalar step.
buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ return;
+ }
+
+ // All IV users are scalar instructions, so only emit a scalar IV, not a
+ // vectorised IV. Except when we tail-fold, then the splat IV feeds the
+ // predicate used by the masked loads/stores.
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (!Cost->isScalarEpilogueAllowed())
+ CreateSplatIV(ScalarIV, Step);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
- assert(Val->getType()->isVectorTy() && "Must be a vector");
- int VLen = Val->getType()->getVectorNumElements();
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ int VLen = ValVTy->getNumElements();
Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
@@ -2052,7 +2076,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
// Initialize packing with insertelements to start from undef.
- Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
VectorLoopValueMap.setVectorValue(V, Part, Undef);
for (unsigned Lane = 0; Lane < VF; ++Lane)
packScalarIntoVectorValue(V, {Part, Lane});
@@ -2118,13 +2142,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
- SmallVector<Constant *, 8> ShuffleMask;
+ SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
- ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+ ShuffleMask.push_back(VF - i - 1);
return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
- ConstantVector::get(ShuffleMask),
- "reverse");
+ ShuffleMask, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
@@ -2166,24 +2189,16 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
- VPTransformState &State,
- VPValue *Addr,
- VPValue *BlockInMask) {
- const InterleaveGroup<Instruction> *Group =
- Cost->getInterleavedAccessGroup(Instr);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Skip if current instruction is not the insert position.
- if (Instr != Group->getInsertPos())
- return;
-
+void InnerLoopVectorizer::vectorizeInterleaveGroup(
+ const InterleaveGroup<Instruction> *Group, VPTransformState &State,
+ VPValue *Addr, VPValue *BlockInMask) {
+ Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+ auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
@@ -2252,21 +2267,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
- auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs,
+ createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
: ShuffledMask;
}
NewLoad =
- Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
+ Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
GroupMask, UndefVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlignment(), "wide.vec");
+ Group->getAlign(), "wide.vec");
Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
@@ -2280,14 +2295,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
if (!Member)
continue;
- Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
+ auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], UndefVec, StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
@@ -2301,7 +2316,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
}
// The sub vector type for current instruction.
- VectorType *SubVT = VectorType::get(ScalarTy, VF);
+ auto *SubVT = FixedVectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2329,23 +2344,23 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
- Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
- "interleaved.vec");
+ Value *IVec = Builder.CreateShuffleVector(
+ WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
+ "interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
- auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
+ "interleaved.mask");
NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
+ IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
}
else
- NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
- Group->getAlignment());
+ NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
Group->addMetadata(NewStoreInstr);
}
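The mask helpers used in this function (createReplicatedMask, createStrideMask, createInterleaveMask) no longer take an IRBuilder and now return integer shuffle masks rather than Constant vectors, which is why the RepMask/IMask/StrideMask temporaries disappear in the hunks above. A small illustrative sketch (not from the patch) that prints the masks for the R,G,B example in the comment preceding vectorizeInterleaveGroup, i.e. interleave factor 3 and VF 4:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void printMask(StringRef Name, ArrayRef<int> Mask) {
  outs() << Name << ":";
  for (int Elt : Mask)
    outs() << " " << Elt;
  outs() << "\n";
}

int main() {
  unsigned VF = 4, InterleaveFactor = 3;
  // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> -- interleaves the R, G and B
  // sub-vectors into one wide vector, as in the comment above.
  printMask("interleave", createInterleaveMask(VF, InterleaveFactor));
  // Mask extracting member 0 (R) of each group from the wide load.
  printMask("stride(0)", createStrideMask(/*Start=*/0, InterleaveFactor, VF));
  // Per-lane block mask replicated across the members of each group.
  printMask("replicated", createReplicatedMask(InterleaveFactor, VF));
  return 0;
}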
@@ -2354,27 +2369,26 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
VPTransformState &State,
VPValue *Addr,
+ VPValue *StoredValue,
VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
LoopVectorizationCostModel::InstWidening Decision =
Cost->getWideningDecision(Instr, VF);
- assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
- "CM decision should be taken at this point");
- if (Decision == LoopVectorizationCostModel::CM_Interleave)
- return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
+ assert((Decision == LoopVectorizationCostModel::CM_Widen ||
+ Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
+ Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
+ "CM decision is not to widen the memory instruction");
Type *ScalarDataTy = getMemInstValueType(Instr);
- Type *DataTy = VectorType::get(ScalarDataTy, VF);
- // An alignment of 0 means target abi alignment. We need to use the scalar's
- // target abi alignment in such a case.
- const DataLayout &DL = Instr->getModule()->getDataLayout();
- const Align Alignment =
- DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
+ auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
+ const Align Alignment = getLoadStoreAlignment(Instr);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -2431,12 +2445,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
- Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
+ Value *StoredVal = State.get(StoredValue, Part);
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
- Alignment.value(), MaskPart);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
} else {
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
@@ -2447,11 +2461,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
}
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(
- StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
else
- NewSI =
- Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
addMetadata(NewSI, SI);
}
@@ -2466,18 +2479,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
- VecPtr, Alignment.value(), BlockInMaskParts[Part],
- UndefValue::get(DataTy), "wide.masked.load");
+ VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
+ "wide.masked.load");
else
- NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
- "wide.load");
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
// Add metadata to the load, but setVectorValue to the reverse shuffle.
addMetadata(NewLI, LI);
@@ -2488,9 +2501,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
const VPIteration &Instance,
- bool IfPredicateInstr) {
+ bool IfPredicateInstr,
+ VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
setDebugLocFromInst(Builder, Instr);
@@ -2504,8 +2518,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ auto *NewOp = State.get(User.getOperand(op), Instance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -2578,7 +2592,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// compare. The only way that we get a backedge taken count is that the
// induction variable was signed and as such will not overflow. In such a case
// truncation is legal.
- if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
IdxTy->getPrimitiveSizeInBits())
BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
@@ -2676,7 +2690,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
"Only one type should be a floating point type");
Type *IntTy =
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- VectorType *VecIntTy = VectorType::get(IntTy, VF);
+ auto *VecIntTy = FixedVectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
@@ -2774,12 +2788,17 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
+ auto *LAI = Legal->getLAI();
+ const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
+ if (!RtPtrChecking.Need)
+ return;
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
- if (!MemRuntimeCheck)
- return;
+ addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
+ RtPtrChecking.getChecks(), RtPtrChecking.getSE());
+ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
if (MemCheckBlock->getParent()->hasOptSize()) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
@@ -2858,6 +2877,18 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateMul(X, Y);
};
+ // Get a suitable insert point for SCEV expansion. For blocks in the vector
+ // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // the DomTree is not kept up-to-date for additional blocks generated in the
+ // vector loop. By using the header as insertion point, we guarantee that the
+ // expanded instructions dominate all their uses.
+ auto GetInsertPoint = [this, &B]() {
+ BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
+ if (InsertBB != LoopVectorBody &&
+ LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
+ return LoopVectorBody->getTerminator();
+ return &*B.GetInsertPoint();
+ };
switch (ID.getKind()) {
case InductionDescriptor::IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
@@ -2865,7 +2896,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
return B.CreateSub(StartValue, Index);
auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+ Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
@@ -2873,8 +2904,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
"Expected constant step for pointer induction");
return B.CreateGEP(
StartValue->getType()->getPointerElementType(), StartValue,
- CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
- &*B.GetInsertPoint())));
+ CreateMul(Index,
+ Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
}
case InductionDescriptor::IK_FpInduction: {
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
@@ -3034,8 +3065,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// This variable saves the new starting index for the scalar loop. It is used
// to test if there are any tail iterations left once the vector loop has
// completed.
- LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
- for (auto &InductionEntry : *List) {
+ for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3258,7 +3288,6 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
unsigned VF,
bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
- StringRef FnName = CI->getCalledFunction()->getName();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
@@ -3268,7 +3297,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
- unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
+ TTI::TCK_RecipThroughput);
if (VF == 1)
return ScalarCallCost;
@@ -3286,11 +3316,15 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
- if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!TLI || CI->isNoBuiltin() || !VecFunc)
return Cost;
// If the corresponding vector cost is cheaper, return its cost.
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
+ TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
@@ -3303,22 +3337,20 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- SmallVector<Value *, 4> Operands(CI->arg_operands());
- return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+ return TTI.getIntrinsicInstrCost(CostAttrs,
+ TargetTransformInfo::TCK_RecipThroughput);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(T1->getVectorElementType());
- auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
+
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(T1->getVectorElementType());
- auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
@@ -3335,14 +3367,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
Value *I = getOrCreateVectorValue(KV.first, Part);
- if (Erased.find(I) != Erased.end() || I->use_empty() ||
- !isa<Instruction>(I))
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy =
IntegerType::get(OriginalTy->getContext(), KV.second);
- Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
- OriginalTy->getVectorNumElements());
+ auto *TruncatedTy = FixedVectorType::get(
+ ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
if (TruncatedTy == OriginalTy)
continue;
@@ -3392,27 +3423,35 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
break;
}
} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements0 =
+ cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ SI->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 =
+ cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
+ SI->getOperand(1),
+ FixedVectorType::get(ScalarTruncatedTy, Elements1));
- NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
} else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements =
+ cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ IE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements =
+ cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ EE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
// If we don't know what to do, be conservative and don't do anything.
@@ -3471,7 +3510,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
PSE.getSE()->forgetLoop(OrigLoop);
// Fix-up external users of the induction variables.
- for (auto &Entry : *Legal->getInductionVars())
+ for (auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
IVEndValues[Entry.first], LoopMiddleBlock);
@@ -3482,6 +3521,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// Remove redundant induction instructions.
cse(LoopVectorBody);
+
+ // Set/update profile weights for the vector and remainder loops as the
+ // original loop iterations are now distributed among them. Note that the
+ // original loop, represented by LoopScalarBody, becomes the remainder loop
+ // after vectorization.
+ //
+ // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+ // end up with a slightly less accurate result, but that should be OK since
+ // the profile is not inherently precise anyway. Note also that a possible
+ // bypass of the vector code caused by legality checks is ignored,
+ // optimistically assigning all the weight to the vector loop.
+ setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
+ LI->getLoopFor(LoopVectorBody),
+ LI->getLoopFor(LoopScalarBody), VF * UF);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
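
To illustrate the comment in the hunk above, here is a minimal standalone sketch of how an estimated trip count might be split between the vector and remainder loops. It assumes the helper simply divides the original count by VF * UF; the exact weight arithmetic inside setProfileInfoAfterUnrolling may differ.

#include <cstdio>

int main() {
  unsigned OrigTripCount = 1000; // estimated from profile data
  unsigned VF = 4, UF = 2;       // vectorization and interleave factors
  unsigned VectorIters = OrigTripCount / (VF * UF);    // 125 vector iterations
  unsigned RemainderIters = OrigTripCount % (VF * UF); // 0 scalar iterations left
  std::printf("vector=%u remainder=%u\n", VectorIters, RemainderIters);
  return 0;
}
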
@@ -3563,8 +3615,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (VF > 1) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
VectorInit = Builder.CreateInsertElement(
- UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
- Builder.getInt32(VF - 1), "vector.recur.init");
+ UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
+ VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
}
// We constructed a temporary phi node in the first phase of vectorization.
@@ -3605,10 +3657,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
- SmallVector<Constant *, 8> ShuffleMask(VF);
- ShuffleMask[0] = Builder.getInt32(VF - 1);
+ SmallVector<int, 8> ShuffleMask(VF);
+ ShuffleMask[0] = VF - 1;
for (unsigned I = 1; I < VF; ++I)
- ShuffleMask[I] = Builder.getInt32(I + VF - 1);
+ ShuffleMask[I] = I + VF - 1;
// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
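
In shufflevector's concatenated-operand numbering, the mask built above selects the last lane of the first input followed by the first VF - 1 lanes of the second. A standalone sketch with VF = 4 (plain C++, mirroring only the index computation):

#include <cstdio>
#include <vector>

int main() {
  unsigned VF = 4;
  std::vector<int> Mask(VF);
  Mask[0] = VF - 1;         // last lane of the first input vector
  for (unsigned I = 1; I < VF; ++I)
    Mask[I] = I + VF - 1;   // lanes 0..VF-2 of the second input vector
  for (int M : Mask)
    std::printf("%d ", M);  // prints: 3 4 5 6
  std::printf("\n");
  return 0;
}
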
@@ -3618,10 +3670,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
- auto *Shuffle =
- VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
- ConstantVector::get(ShuffleMask))
- : Incoming;
+ auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ShuffleMask)
+ : Incoming;
PhiPart->replaceAllUsesWith(Shuffle);
cast<Instruction>(PhiPart)->eraseFromParent();
VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
@@ -3684,7 +3735,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  // Get its reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
@@ -3725,7 +3776,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// incoming scalar reduction.
VectorStart = ReductionStartValue;
} else {
- Identity = ConstantVector::getSplat(VF, Iden);
+ Identity = ConstantVector::getSplat({VF, false}, Iden);
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
@@ -3787,7 +3838,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
@@ -4036,9 +4087,11 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
}
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
- unsigned VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant) {
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
+ unsigned UF, unsigned VF,
+ bool IsPtrLoopInvariant,
+ SmallBitVector &IsIndexLoopInvariant,
+ VPTransformState &State) {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
// results in a vector of pointers when at least one operand of the GEP
@@ -4075,19 +4128,18 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
for (unsigned Part = 0; Part < UF; ++Part) {
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
- ? GEP->getPointerOperand()
- : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+ auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
+ : State.get(Operands.getOperand(0), Part);
// Collect all the indices for the new GEP. If any index is
// loop-invariant, we won't broadcast it.
SmallVector<Value *, 4> Indices;
- for (auto Index : enumerate(GEP->indices())) {
- Value *User = Index.value().get();
- if (IsIndexLoopInvariant[Index.index()])
- Indices.push_back(User);
+ for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
+ VPValue *Operand = Operands.getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, {0, 0}));
else
- Indices.push_back(getOrCreateVectorValue(User, Part));
+ Indices.push_back(State.get(Operand, Part));
}
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
@@ -4114,7 +4166,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.
Type *VecTy =
- (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
OrigPHIsToFix.push_back(P);
@@ -4133,7 +4185,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
Type *VecTy =
- (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
@@ -4145,9 +4197,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// This PHINode must be an induction variable.
// Make sure that we know about it.
- assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
+ assert(Legal->getInductionVars().count(P) && "Not an induction variable");
- InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+ InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
@@ -4203,11 +4255,14 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I) {
+void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
+ VPTransformState &State) {
switch (I.getOpcode()) {
+ case Instruction::Call:
case Instruction::Br:
case Instruction::PHI:
case Instruction::GetElementPtr:
+ case Instruction::Select:
llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4233,8 +4288,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 2> Ops;
- for (Value *Op : I.operands())
- Ops.push_back(getOrCreateVectorValue(Op, Part));
+ for (VPValue *VPOp : User.operands())
+ Ops.push_back(State.get(VPOp, Part));
Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
@@ -4248,35 +4303,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
break;
}
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- auto *SE = PSE.getSE();
- bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
-
- auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
- Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
- Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
- Value *Sel =
- Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
- VectorLoopValueMap.setVectorValue(&I, Part, Sel);
- addMetadata(Sel, &I);
- }
-
- break;
- }
-
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
@@ -4284,8 +4310,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
auto *Cmp = cast<CmpInst>(&I);
setDebugLocFromInst(Builder, Cmp);
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
- Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *B = State.get(User.getOperand(1), Part);
Value *C = nullptr;
if (FCmp) {
// Propagate fast math flags.
@@ -4319,78 +4345,80 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
/// Vectorize casts.
Type *DestTy =
- (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+ (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *A = State.get(User.getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
VectorLoopValueMap.setVectorValue(&I, Part, Cast);
addMetadata(Cast, &I);
}
break;
}
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
- case Instruction::Call: {
- // Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- break;
- setDebugLocFromInst(Builder, &I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ VPTransformState &State) {
+ assert(!isa<DbgInfoIntrinsic>(I) &&
+ "DbgInfoIntrinsic should have been dropped during VPlan construction");
+ setDebugLocFromInst(Builder, &I);
- StringRef FnName = CI->getCalledFunction()->getName();
- Function *F = CI->getCalledFunction();
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- Value *Arg = CI->getArgOperand(i);
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
- Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
- Args.push_back(Arg);
- }
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
+ unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- } else {
- // Use vector version of the library call.
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
- assert(!VFnName.empty() && "Vector function name is empty.");
- VectorF = M->getFunction(VFnName);
- if (!VectorF) {
- // Generate a declaration
- FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
- VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
- VectorF->copyAttributesFrom(F);
- }
- }
- assert(VectorF && "Can't create vector function.");
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (auto &I : enumerate(ArgOperands.operands())) {
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ Value *Arg;
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ Arg = State.get(I.value(), Part);
+ else
+ Arg = State.get(I.value(), {0, 0});
+ Args.push_back(Arg);
+ }
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] =
+ FixedVectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ assert(VectorF && "Can't retrieve vector intrinsic.");
+ } else {
+ // Use vector version of the function call.
+ const VFShape Shape =
+ VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+#ifndef NDEBUG
+ assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ "Can't create vector function.");
+#endif
+ VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ }
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
@@ -4400,16 +4428,31 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
VectorLoopValueMap.setVectorValue(&I, Part, V);
addMetadata(V, &I);
- }
-
- break;
}
+}
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
+void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
+ VPUser &Operands,
+ bool InvariantCond,
+ VPTransformState &State) {
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond =
+ InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
+ Value *Op0 = State.get(Operands.getOperand(1), Part);
+ Value *Op1 = State.get(Operands.getOperand(2), Part);
+ Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
+ }
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
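
A standalone sketch of the select-widening logic in widenSelectInstruction above, using plain arrays in place of vector values: when the condition is loop invariant, every lane reads lane 0 of the vectorized condition; otherwise each lane uses its own condition lane. Values are illustrative only.

#include <cstdio>

int main() {
  const unsigned VF = 4;
  bool Cond[VF] = {true, false, true, false};
  int Op0[VF] = {10, 11, 12, 13};
  int Op1[VF] = {20, 21, 22, 23};
  bool InvariantCond = false; // set to true to broadcast Cond[0] to all lanes

  int Sel[VF];
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    bool C = InvariantCond ? Cond[0] : Cond[Lane];
    Sel[Lane] = C ? Op0[Lane] : Op1[Lane];
  }
  for (int V : Sel)
    std::printf("%d ", V); // prints: 10 21 12 23
  std::printf("\n");
  return 0;
}
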
@@ -4502,7 +4545,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
}
for (auto *I : ScalarPtrs)
- if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
+ if (!PossibleNonScalarPtrs.count(I)) {
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -4513,7 +4556,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// TODO: Once we are able to vectorize pointer induction variables we should
// no longer insert them into the worklist here.
auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
@@ -4556,7 +4599,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4568,6 +4611,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
continue;
+ // If tail-folding is applied, the primary induction variable will be used
+ // to feed a vector compare.
+ if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
+ continue;
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
@@ -4618,7 +4666,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
"Widening decision should be ready at this moment");
return WideningDecision == CM_Scalarize;
}
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
isLegalMaskedGather(Ty, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
@@ -4665,7 +4713,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
"Masked interleave-groups for predicated accesses are not enabled.");
auto *Ty = getMemInstValueType(I);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
: TTI.isLegalMaskedStore(Ty, Alignment);
}
@@ -4803,7 +4851,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
- if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+ if (!PossibleNonUniformPtrs.count(V))
addToWorklistIfAllowed(V);
// Expand Worklist in topological order: whenever a new instruction
@@ -4847,7 +4895,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4903,10 +4951,9 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
// FIXME: Avoid specializing for stride==1 instead of bailing out.
if (!Legal->getLAI()->getSymbolicStrides().empty()) {
- reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+ reportVectorizationFailure("Runtime stride check for small trip count",
"runtime stride == 1 checks needed. Enable vectorization of "
- "this loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
+ "this loop without such check by compiling with -Os/-Oz",
"CantVersionLoopWithOptForSize", ORE, TheLoop);
return true;
}
@@ -4914,7 +4961,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
+ unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -4936,7 +4984,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC);
+ return UserVF ? UserVF : computeFeasibleMaxVF(TC);
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -4964,11 +5012,18 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI))
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+ "No decisions should have been taken at this point");
+ // Note: There is no need to invalidate any cost modeling decisions here, as
+ // none were taken so far.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ }
- unsigned MaxVF = computeFeasibleMaxVF(TC);
- if (TC > 0 && TC % MaxVF == 0) {
+ unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
+ unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
+ if (TC > 0 && TC % MaxVFtimesIC == 0) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
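
A standalone sketch of the no-tail check above, with invented numbers: a tail only remains when the known trip count is not a multiple of MaxVF times the user-requested interleave count.

#include <cstdio>

int main() {
  unsigned TC = 64;    // known constant trip count
  unsigned MaxVF = 8;  // feasible (or user-forced) vectorization factor
  unsigned UserIC = 2; // user-requested interleave count, 0 if none given
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  bool NoTail = TC > 0 && TC % MaxVFtimesIC == 0;
  std::printf("no tail remains: %s\n", NoTail ? "yes" : "no"); // yes: 64 is a multiple of 16
  return 0;
}
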
@@ -5015,7 +5070,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
- unsigned MaxVectorSize = WidestRegister / WidestType;
+ // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+ // Note that both WidestRegister and WidestType may not be powers of 2.
+ unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
@@ -5140,7 +5197,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
Type *T = I.getType();
// Skip ignored values.
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
+ if (ValuesToIgnore.count(&I))
continue;
// Only examine Loads, Stores and PHINodes.
@@ -5152,7 +5209,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
T = RdxDesc.getRecurrenceType();
}
@@ -5294,7 +5351,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && !Legal->getReductionVars()->empty()) {
+ if (VF > 1 && !Legal->getReductionVars().empty()) {
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -5325,7 +5382,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
+ if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -5345,7 +5402,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = !Legal->getReductionVars()->empty();
+ bool HasReductions = !Legal->getReductionVars().empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
@@ -5459,11 +5516,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
OpenIntervals.erase(ToRemove);
// Ignore instructions that are never used within the loop.
- if (Ends.find(I) == Ends.end())
+ if (!Ends.count(I))
continue;
// Skip ignored values.
- if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
+ if (ValuesToIgnore.count(I))
continue;
// For each VF find the maximum usage of registers.
@@ -5483,7 +5540,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
collectUniformsAndScalars(VFs[j]);
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
+ if (VecValuesToIgnore.count(Inst))
continue;
if (isScalarAfterVectorization(Inst, VFs[j])) {
unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
@@ -5676,9 +5733,11 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
- true, false);
- ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(I->getType(), VF)),
+ APInt::getAllOnesValue(VF), true, false);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
+ TTI::TCK_RecipThroughput);
}
// Compute the scalarization overhead of needed extractelement
@@ -5693,7 +5752,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Worklist.push_back(J);
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
- ToVectorTy(J->getType(),VF), false, true);
+ cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ APInt::getAllOnesValue(VF), false, true);
}
// Scale the total scalar cost by block probability.
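
A standalone sketch of the scaling mentioned in the comment above, assuming a fixed reciprocal block probability of 2 (i.e. the predicated block is taken on roughly half the iterations); all costs are invented.

#include <cstdio>

int main() {
  unsigned ScalarCost = 24;             // cost of the guarded, scalarized copies
  unsigned VectorCost = 16;             // cost of keeping the instruction vectorized
  unsigned ReciprocalPredBlockProb = 2; // assumed: block runs every other iteration
  unsigned ScaledScalarCost = ScalarCost / ReciprocalPredBlockProb; // 12
  int Discount = (int)VectorCost - (int)ScaledScalarCost;           // 4
  // With these definitions, a positive discount means the scalarized form is
  // estimated to be cheaper than the vectorized one.
  std::printf("discount = %d\n", Discount);
  return 0;
}
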
@@ -5719,8 +5779,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
- (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
+ if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
@@ -5806,9 +5865,10 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
+ Alignment, AS,
+ TTI::TCK_RecipThroughput);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5832,20 +5892,22 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
- Alignment ? Alignment->value() : 0, AS);
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind);
else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind, I);
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -5856,19 +5918,22 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
+ CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);
bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
+ CostKind) +
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5878,27 +5943,27 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
- Value *Ptr = getLoadStorePointerOperand(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I),
- Alignment ? Alignment->value() : 0);
+ TTI.getGatherScatterOpCost(
+ I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ TargetTransformInfo::TCK_RecipThroughput, I);
}
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
- Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+ auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
@@ -5913,8 +5978,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
+ AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
@@ -5932,11 +5997,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// moment.
if (VF == 1) {
Type *ValTy = getMemInstValueType(I);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+ TTI::TCK_RecipThroughput, I);
}
return getWideningCost(I, VF);
}
@@ -5955,7 +6021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
auto ForcedScalar = ForcedScalars.find(VF);
if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
- if (InstSet.find(I) != InstSet.end())
+ if (InstSet.count(I))
return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
}
@@ -5977,7 +6043,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
- Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+ Cost += TTI.getScalarizationOverhead(
+ cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6157,6 +6224,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
@@ -6173,21 +6241,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF > 1 && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
- PredicatedBBsAfterVectorization.end() ||
- PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
- PredicatedBBsAfterVectorization.end()))
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
// Return cost for branches around scalarized and predicated blocks.
- Type *Vec_i1Ty =
- VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
- return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
- (TTI.getCFInstrCost(Instruction::Br) * VF));
+ auto *Vec_i1Ty =
+ FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
+ false, true) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
// The back-edge branch will remain, as will all scalar branches.
- return TTI.getCFInstrCost(Instruction::Br);
+ return TTI.getCFInstrCost(Instruction::Br, CostKind);
else
// This branch will be eliminated by if-conversion.
return 0;
@@ -6202,7 +6269,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- VectorTy, VF - 1, VectorType::get(RetTy, 1));
+ cast<VectorType>(VectorTy), VF - 1,
+ FixedVectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -6211,9 +6279,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
Instruction::Select, ToVectorTy(Phi->getType(), VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ CostKind);
- return TTI.getCFInstrCost(Instruction::PHI);
+ return TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -6230,10 +6299,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// that we will create. This cost is likely to be zero. The phi node
// cost, if any, should be scaled by the block probability because it
// models a copy at the end of each predicated block.
- Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
// The cost of the non-predicated instruction.
- Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+ Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -6274,13 +6343,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
I->getOperand(0), I);
@@ -6291,9 +6362,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
- CondTy = VectorType::get(CondTy, VF);
+ CondTy = FixedVectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+ CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6302,7 +6374,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
+ I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -6335,7 +6408,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
- Trunc->getSrcTy(), Trunc);
+ Trunc->getSrcTy(), CostKind, Trunc);
}
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6361,7 +6434,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
- return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+ return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
+ CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -6374,7 +6448,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+ CostKind) +
getScalarizationOverhead(I, VF);
} // end of switch.
}
@@ -6397,6 +6472,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
@@ -6424,14 +6500,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
- for (auto &Reduction : *Legal->getReductionVars()) {
+ for (auto &Reduction : Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
@@ -6490,9 +6566,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
+ unsigned UserIC) {
assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
@@ -6503,7 +6580,11 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n");
- CM.InterleaveInfo.reset();
+ if (CM.InterleaveInfo.invalidateGroups())
+ // Invalidating interleave groups also requires invalidating all decisions
+ // based on them, which includes widening decisions and uniform and scalar
+ // values.
+ CM.invalidateCostModelingDecisions();
}
if (UserVF) {
@@ -6563,6 +6644,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
&ILV, CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
+ State.CanonicalIV = ILV.Induction;
//===------------------------------------------------===//
//
@@ -6595,12 +6677,11 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
- DeadInstructions.end();
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
@@ -6716,7 +6797,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found");
- if (!BI->isConditional())
+ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
@@ -6749,9 +6830,21 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
- VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ // Start by constructing the desired canonical IV.
+ VPValue *IV = nullptr;
+ if (Legal->getPrimaryInduction())
+ IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ else {
+ auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ Builder.getInsertBlock()->appendRecipe(IVRecipe);
+ IV = IVRecipe->getVPValue();
+ }
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ bool TailFolded = !CM.isScalarEpilogueAllowed();
+ if (TailFolded && CM.TTI.emitGetActiveLaneMask())
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
+ else
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
return BlockMaskCache[BB] = BlockMask;
}
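
A standalone sketch of the header block mask with tail folding: lane L is active iff IV + L <= BTC, and comparing against BTC = TC - 1 avoids the wrap that TC itself can suffer (for example, an i8 counted loop of 256 iterations has TC == 0 but BTC == 255). The values below are illustrative.

#include <cstdio>

int main() {
  const unsigned VF = 4;
  unsigned TC = 10;      // scalar trip count
  unsigned BTC = TC - 1; // backedge-taken count
  unsigned IV = 8;       // first lane index of the final (partial) vector iteration
  bool Mask[VF];
  for (unsigned L = 0; L < VF; ++L)
    Mask[L] = (IV + L) <= BTC;
  for (unsigned L = 0; L < VF; ++L)
    std::printf("lane %u: %d\n", L, Mask[L]); // lanes 0 and 1 active, 2 and 3 masked off
  return 0;
}
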
@@ -6775,8 +6868,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan) {
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- return nullptr;
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Must be called with either a load or store");
auto willWiden = [&](unsigned VF) -> bool {
if (VF == 1)
@@ -6801,22 +6894,29 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
Mask = createBlockInMask(I->getParent(), Plan);
VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
- return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
+ if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
+
+ StoreInst *Store = cast<StoreInst>(I);
+ VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
+ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
- if (PHINode *Phi = dyn_cast<PHINode>(I)) {
- // Check if this is an integer or fp induction. If so, build the recipe that
- // produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction)
- return new VPWidenIntOrFpInductionRecipe(Phi);
+VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
- return nullptr;
- }
+ return nullptr;
+}
+VPWidenIntOrFpInductionRecipe *
+VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
+ VFRange &Range) const {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -6830,54 +6930,89 @@ VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
[=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};
- if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
- isOptimizableIVTruncate(I), Range))
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- cast<TruncInst>(I));
+ I);
return nullptr;
}
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi || Phi->getParent() == OrigLoop->getHeader())
- return nullptr;
-
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- SmallVector<VPValue *, 2> Masks;
+ SmallVector<VPValue *, 2> Operands;
unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
assert((EdgeMask || NumIncoming == 1) &&
"Multiple predecessors with one having a full mask");
+ Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
if (EdgeMask)
- Masks.push_back(EdgeMask);
+ Operands.push_back(EdgeMask);
}
- return new VPBlendRecipe(Phi, Masks);
+ return new VPBlendRecipe(Phi, Operands);
}
-bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
- VFRange &Range) {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+ [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
+ Range);
if (IsPredicated)
- return false;
+ return nullptr;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
+ unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
+}
+bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
+ assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
+ !isa<StoreInst>(I) && "Instruction should have been handled earlier");
+ // The instruction should be widened, unless it is scalar after
+ // vectorization, scalarization is profitable, or it is predicated.
+ auto WillScalarize = [this, I](unsigned VF) -> bool {
+ return CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
+ };
+ return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
+ Range);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
auto IsVectorizableOpcode = [](unsigned Opcode) {
switch (Opcode) {
case Instruction::Add:
case Instruction::And:
case Instruction::AShr:
case Instruction::BitCast:
- case Instruction::Br:
- case Instruction::Call:
case Instruction::FAdd:
case Instruction::FCmp:
case Instruction::FDiv:
@@ -6891,11 +7026,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
case Instruction::FSub:
case Instruction::ICmp:
case Instruction::IntToPtr:
- case Instruction::Load:
case Instruction::LShr:
case Instruction::Mul:
case Instruction::Or:
- case Instruction::PHI:
case Instruction::PtrToInt:
case Instruction::SDiv:
case Instruction::Select:
@@ -6903,7 +7036,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
case Instruction::Shl:
case Instruction::SIToFP:
case Instruction::SRem:
- case Instruction::Store:
case Instruction::Sub:
case Instruction::Trunc:
case Instruction::UDiv:
@@ -6917,60 +7049,10 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
};
if (!IsVectorizableOpcode(I->getOpcode()))
- return false;
-
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
- return false;
- }
-
- auto willWiden = [&](unsigned VF) -> bool {
- if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF)))
- return false;
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
- return UseVectorIntrinsic || !NeedToScalarize;
- }
- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- assert(CM.getWideningDecision(I, VF) ==
- LoopVectorizationCostModel::CM_Scalarize &&
- "Memory widening decisions should have been taken care by now");
- return false;
- }
- return true;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return false;
- // If this ingredient's recipe is to be recorded, keep its recipe a singleton
- // to avoid having to split recipes later.
- bool IsSingleton = Ingredient2Recipe.count(I);
+ return nullptr;
// Success: widen this instruction.
-
- // Use the default widening recipe. We optimize the common case where
- // consecutive instructions can be represented by a single recipe.
- if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
- LastExtensibleRecipe->appendInstruction(I))
- return true;
-
- VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
- if (!IsSingleton)
- LastExtensibleRecipe = WidenRecipe;
- setRecipe(I, WidenRecipe);
- VPBB->appendRecipe(WidenRecipe);
- return true;
+ return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
@@ -6984,7 +7066,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
- auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+ auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
+ IsUniform, IsPredicated);
setRecipe(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -7041,43 +7124,45 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
return Region;
}
-bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
- VPlanPtr &Plan, VPBasicBlock *VPBB) {
- VPRecipeBase *Recipe = nullptr;
-
- // First, check for specific widening recipes that deal with memory
+VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ // First, check for specific widening recipes that deal with calls, memory
// operations, inductions and Phi nodes.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
- (Recipe = tryToOptimizeInduction(Instr, Range)) ||
- (Recipe = tryToBlend(Instr, Plan)) ||
- (isa<PHINode>(Instr) &&
- (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
- setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- return true;
- }
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return tryToWidenCall(CI, Range, *Plan);
- // Handle GEP widening.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
- auto Scalarize = [&](unsigned VF) {
- return CM.isScalarWithPredication(Instr, VF) ||
- CM.isScalarAfterVectorization(Instr, VF) ||
- CM.isProfitableToScalarize(Instr, VF);
- };
- if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
- return false;
- VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
- setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- return true;
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return tryToWidenMemory(Instr, Range, Plan);
+
+ VPRecipeBase *Recipe;
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Phi->getParent() != OrigLoop->getHeader())
+ return tryToBlend(Phi, Plan);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi)))
+ return Recipe;
+ return new VPWidenPHIRecipe(Phi);
}
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes.
- if (tryToWiden(Instr, VPBB, Range))
- return true;
+ if (isa<TruncInst>(Instr) &&
+ (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
+ return Recipe;
- return false;
+ if (!shouldWiden(Instr, Range))
+ return nullptr;
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
+ return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
+ OrigLoop);
+
+ if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ bool InvariantCond =
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
+ return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
+ InvariantCond);
+ }
+
+ return tryToWiden(Instr, *Plan);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
@@ -7097,13 +7182,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
NeedDef.insert(Branch->getCondition());
}
- // If the tail is to be folded by masking, the primary induction variable
- // needs to be represented in VPlan for it to model early-exit masking.
+ // If the tail is to be folded by masking, the primary induction variable, if
+ // it exists, needs to be represented in VPlan for it to model early-exit masking.
// Also, both the Phi and the live-out instruction of each reduction are
// required in order to introduce a select between them in VPlan.
if (CM.foldTailByMasking()) {
- NeedDef.insert(Legal->getPrimaryInduction());
- for (auto &Reduction : *Legal->getReductionVars()) {
+ if (Legal->getPrimaryInduction())
+ NeedDef.insert(Legal->getPrimaryInduction());
+ for (auto &Reduction : Legal->getReductionVars()) {
NeedDef.insert(Reduction.first);
NeedDef.insert(Reduction.second.getLoopExitInstr());
}
@@ -7118,28 +7204,39 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ // Add assume instructions we need to drop to DeadInstructions, to prevent
+ // them from being added to the VPlan.
+ // TODO: We only need to drop assumes in blocks that get flattened. If the
+ // control flow is preserved, we should keep them.
+ auto &ConditionalAssumes = Legal->getConditionalAssumes();
+ DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ // Dead instructions do not need sinking. Remove them from SinkAfter.
+ for (Instruction *I : DeadInstructions)
+ SinkAfter.erase(I);
+
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(
- buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
+ DeadInstructions, SinkAfter));
VF = SubRange.End;
}
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ const DenseMap<Instruction *, Instruction *> &SinkAfter) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
-
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -7177,8 +7274,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ auto Plan = std::make_unique<VPlan>();
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- auto Plan = std::make_unique<VPlan>(VPBB);
+ Plan->setEntry(VPBB);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
@@ -7199,17 +7297,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
+ // TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) ||
- DeadInstructions.find(Instr) != DeadInstructions.end())
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
- if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
+ if (auto Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ VPBB->appendRecipe(Recipe);
continue;
+ }
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
@@ -7264,7 +7366,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (CM.foldTailByMasking()) {
Builder.setInsertPoint(VPBB);
auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : *Legal->getReductionVars()) {
+ for (auto &Reduction : Legal->getReductionVars()) {
VPValue *Phi = Plan->getVPValue(Reduction.first);
VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
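A minimal per-lane illustration, in plain C++ rather than VPlan code, of the select created above when the tail is folded by masking: lanes disabled by the header mask keep the reduction phi's value, so iterations beyond the trip count cannot disturb the final reduction result.

static int maskedReductionStep(bool LaneActive, int PhiValue, int LoopExitValue) {
  // select(Cond, Red, Phi): active lanes take the freshly computed value,
  // inactive lanes carry the phi value forward unchanged.
  return LaneActive ? LoopExitValue : PhiValue;
}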
@@ -7330,32 +7432,37 @@ Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
return ILV.getOrCreateScalarValue(V, Instance);
}
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
- getAddr()->printAsOperand(O);
+ getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
O << ", ";
- Mask->printAsOperand(O);
+ Mask->printAsOperand(O, SlotTracker);
}
- O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i))
- O << " +\n"
- << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+ O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
+}
+
+void VPWidenCallRecipe::execute(VPTransformState &State) {
+ State.ILV->widenCallInstruction(Ingredient, User, State);
+}
+
+void VPWidenSelectRecipe::execute(VPTransformState &State) {
+ State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
}
void VPWidenRecipe::execute(VPTransformState &State) {
- for (auto &Instr : make_range(Begin, End))
- State.ILV->widenInstruction(Instr);
+ State.ILV->widenInstruction(Ingredient, User, State);
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant);
+ State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
+ IsIndexLoopInvariant, State);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
@@ -7376,27 +7483,27 @@ void VPBlendRecipe::execute(VPTransformState &State) {
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- unsigned NumIncoming = Phi->getNumIncomingValues();
+ unsigned NumIncoming = getNumIncomingValues();
- assert((User || NumIncoming == 1) &&
- "Multiple predecessors with predecessors having a full mask");
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
+ // SELECT(Mask2, In2,
+ // SELECT(Mask1, In1,
+ // In0)))
+ // Note that Mask0 is never used: lanes for which no path reaches this phi,
+ // and which are therefore essentially undef, are taken from In0.
InnerLoopVectorizer::VectorParts Entry(State.UF);
for (unsigned In = 0; In < NumIncoming; ++In) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
- Value *In0 =
- State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ Value *In0 = State.get(getIncomingValue(In), Part);
if (In == 0)
Entry[Part] = In0; // Initialize with the first incoming value.
else {
// Select between the current value and the previous incoming edge
// based on the incoming mask.
- Value *Cond = State.get(User->getOperand(In), Part);
+ Value *Cond = State.get(getMask(In), Part);
Entry[Part] =
State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
}
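A minimal scalar illustration, in plain C++ rather than VPlan code, of the select chain the loop above builds for a phi with three incoming values; as the comment notes, the mask of the first incoming value is never consulted.

static int blendThreeIncoming(bool Mask1, bool Mask2, int In0, int In1, int In2) {
  int Acc = In0;           // initialize with the first incoming value
  Acc = Mask1 ? In1 : Acc; // select(Mask1, In1, In0)
  Acc = Mask2 ? In2 : Acc; // select(Mask2, In2, select(Mask1, In1, In0))
  return Acc;
}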
@@ -7408,19 +7515,19 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
- getMask());
+ State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
- State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
+ IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF > 1) {
// If we're constructing lane 0, initialize to start from undef.
if (State.Instance->Lane == 0) {
- Value *Undef =
- UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ Value *Undef = UndefValue::get(
+ FixedVectorType::get(Ingredient->getType(), State.VF));
State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
}
State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
@@ -7434,7 +7541,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
unsigned EndLane = IsUniform ? 1 : State.VF;
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+ State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
+ IsPredicated, State);
}
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
@@ -7444,15 +7552,14 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
unsigned Lane = State.Instance->Lane;
Value *ConditionBit = nullptr;
- if (!User) // Block in mask is all-one.
- ConditionBit = State.Builder.getTrue();
- else {
- VPValue *BlockInMask = User->getOperand(0);
+ VPValue *BlockInMask = getMask();
+ if (BlockInMask) {
ConditionBit = State.get(BlockInMask, Part);
if (ConditionBit->getType()->isVectorTy())
ConditionBit = State.Builder.CreateExtractElement(
ConditionBit, State.Builder.getInt32(Lane));
- }
+ } else // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
@@ -7496,7 +7603,9 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
+ VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
+ State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
+ getMask());
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -7513,16 +7622,15 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
PGSOQueryType::IRPass);
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
- if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+ if (OptSize)
return CM_ScalarEpilogueNotAllowedOptSize;
bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
!PreferPredicateOverEpilog;
// 2) Next, if disabling predication is requested on the command line, honour
- // this and request a scalar epilogue. Also do this if we don't have a
- // primary induction variable, which is required for predication.
- if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+ // this and request a scalar epilogue.
+ if (PredicateOptDisabled)
return CM_ScalarEpilogueAllowed;
// 3) and 4) look if enabling predication is requested on the command line,
@@ -7549,6 +7657,10 @@ static bool processLoopInVPlanNativePath(
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+ if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
+ LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+ return false;
+ }
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
@@ -7561,7 +7673,7 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
@@ -7587,10 +7699,16 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
+LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
+ : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
+ !EnableLoopInterleaving),
+ VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
+ !EnableLoopVectorization) {}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->empty()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -7720,17 +7838,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
- // Get user vectorization factor.
+ // Get user vectorization factor and interleave count.
unsigned UserVF = Hints.getWidth();
+ unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
- unsigned UserIC = Hints.getInterleave();
if (MaybeVF) {
VF = *MaybeVF;
@@ -7883,14 +8001,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.setAlreadyVectorized();
}
- LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
-bool LoopVectorizePass::runImpl(
+LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
@@ -7915,9 +8033,9 @@ bool LoopVectorizePass::runImpl(
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
TTI->getMaxInterleaveFactor(1) < 2)
- return false;
+ return LoopVectorizeResult(false, false);
- bool Changed = false;
+ bool Changed = false, CFGChanged = false;
// The vectorizer requires loops to be in simplified form.
// Since simplification may add new inner loops, it has to run before the
@@ -7925,7 +8043,7 @@ bool LoopVectorizePass::runImpl(
// will simplify all loops, regardless of whether anything end up being
// vectorized.
for (auto &L : *LI)
- Changed |=
+ Changed |= CFGChanged |=
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
// Build up a worklist of inner-loops to vectorize. This is necessary as
@@ -7946,11 +8064,11 @@ bool LoopVectorizePass::runImpl(
// transform.
Changed |= formLCSSARecursively(*L, *DT, LI, SE);
- Changed |= processLoop(L);
+ Changed |= CFGChanged |= processLoop(L);
}
// Process each loop nest in the function.
- return Changed;
+ return LoopVectorizeResult(Changed, CFGChanged);
}
PreservedAnalyses LoopVectorizePass::run(Function &F,
@@ -7975,13 +8093,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
- const ModuleAnalysisManager &MAM =
- AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
- MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- bool Changed =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ LoopVectorizeResult Result =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
- if (!Changed)
+ if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -7995,5 +8112,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
}
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
+ if (!Result.MadeCFGChange)
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index aabd974cd73e4..5bc35aa4695f8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -107,9 +109,8 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-cl::opt<bool>
- llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -284,7 +285,7 @@ static bool isCommutative(Instruction *I) {
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
- unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ unsigned Size = EI0->getVectorOperandType()->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
@@ -293,7 +294,7 @@ isShuffle(ArrayRef<Value *> VL) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
- if (Vec->getType()->getVectorNumElements() != Size)
+ if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
@@ -377,6 +378,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) {
return S.OpValue;
}
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// Example of unsupported opcode is SDIV that can potentially cause UB if the
+/// "shuffled out" lane would result in division by zero.
+static bool isValidForAlternation(unsigned Opcode) {
+ if (Instruction::isIntDivRem(Opcode))
+ return false;
+
+ return true;
+}
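A minimal scalar illustration, assuming a hypothetical two-element bundle, of the hazard this check avoids: the scalar code below divides only by B, while an alternating vector form would also evaluate the division with D as the divisor before discarding that lane, so D == 0 would introduce undefined behaviour the original program could never trigger.

static int alternationHazard(int A, int B, int C, int D) {
  int Lane0 = A / B; // the only division the scalar code performs
  int Lane1 = C + D; // the alternate lane performs an add instead
  return Lane0 ^ Lane1;
}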
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
@@ -399,7 +412,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
- if (Opcode == AltOpcode) {
+ if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
+ isValidForAlternation(Opcode)) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -411,6 +425,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -613,7 +630,7 @@ public:
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
- unsigned getVectorElementSize(Value *V) const;
+ unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
@@ -650,6 +667,15 @@ public:
/// may not be necessary.
bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+ /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineCandidate() const;
+
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
@@ -816,13 +842,12 @@ public:
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
- auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
- auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
- if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
- cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
- cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+ Value *EV;
+ ConstantInt *Ex1Idx, *Ex2Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
+ match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
+ Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
return VLOperands::ScoreConsecutiveExtracts;
- }
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
@@ -852,7 +877,7 @@ public:
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
int Cost = 0;
- SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+ std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
@@ -1385,7 +1410,8 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
+ int getGatherCost(VectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -1422,7 +1448,7 @@ private:
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
+ [this](Value *V, int Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -1436,7 +1462,7 @@ private:
EntryState State;
/// Does this sequence require some shuffling?
- SmallVector<unsigned, 4> ReuseShuffleIndices;
+ SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
@@ -1690,6 +1716,9 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ /// Maps a value to the proposed vectorizable size.
+ SmallDenseMap<Value *, unsigned> InstrElementSize;
+
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
@@ -2001,6 +2030,20 @@ private:
if (TreeEntry *TE = BundleMember->TE) {
int Lane = BundleMember->Lane;
assert(Lane >= 0 && "Lane not set");
+
+ // Since the vectorization tree is built recursively, this assertion
+ // ensures that the tree entry has all of its operands set before reaching
+ // this code. A couple of known exceptions are extracts, whose second
+ // (immediate) operand is not added. Since immediates do not affect
+ // scheduler behavior, this is considered okay.
+ auto *In = TE->getMainOp();
+ assert(In &&
+ (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
+ (void)In; // fake use to avoid build failure when assertions disabled
+
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
OpIdx != NumOperands; ++OpIdx)
if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
@@ -2323,6 +2366,7 @@ BoUpSLP::~BoUpSLP() {
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
+ assert(!verifyFunction(*F, &dbgs()));
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
@@ -2978,19 +3022,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic.
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
CallInst *CI = cast<CallInst>(VL0);
- // Check if this is an Intrinsic call or something that can be
- // represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (!isTriviallyVectorizable(ID)) {
+
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
- Function *Int = CI->getCalledFunction();
+ Function *F = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
@@ -2998,8 +3047,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != Int ||
+ if (!CI2 || CI2->getCalledFunction() != F ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -3101,7 +3152,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<CompositeType>(EltTy)) {
+ while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
+ isa<VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -3109,16 +3161,19 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return 0;
N *= ST->getNumElements();
EltTy = *ST->element_begin();
+ } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
+ N *= AT->getNumElements();
+ EltTy = AT->getElementType();
} else {
- auto *SeqT = cast<SequentialType>(EltTy);
- N *= SeqT->getNumElements();
- EltTy = SeqT->getElementType();
+ auto *VT = cast<VectorType>(EltTy);
+ N *= VT->getNumElements();
+ EltTy = VT->getElementType();
}
}
if (!isValidElementType(EltTy))
return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
@@ -3148,7 +3203,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
- NElts = Vec->getType()->getVectorNumElements();
+ NElts = cast<VectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
@@ -3198,6 +3253,35 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
});
}
+static std::pair<unsigned, unsigned>
+getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
+ int IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
+ auto Shape =
+ VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ int LibCost = IntrinsicCost;
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
+ TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -3206,12 +3290,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
- VecTy = VectorType::get(
+ VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
@@ -3251,6 +3336,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -3260,7 +3346,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return 0;
case Instruction::ExtractValue:
- case Instruction::ExtractElement:
+ case Instruction::ExtractElement: {
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
@@ -3289,43 +3375,41 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- if (E->State == TreeEntry::Vectorize) {
- int DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
- // If all users are going to be vectorized, instruction can be
- // considered as dead.
- // The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E)) {
- // Take credit for instruction that will become dead.
- if (E->hasOneUse()) {
- Instruction *Ext = E->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, i);
- // Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
- continue;
- }
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+ // If all users are going to be vectorized, instruction can be
+ // considered as dead.
+ // The same holds if it has only one user; it will be vectorized for sure.
+ if (areAllUsersVectorized(E)) {
+ // Take credit for instruction that will become dead.
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
+ Ext);
+ continue;
}
- DeadCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
+ DeadCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
- return DeadCost;
}
- return ReuseShuffleCost + getGatherCost(VL);
-
+ return DeadCost;
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3340,7 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
+ VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -3348,12 +3433,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
- VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ CostKind, VL0);
}
return VecCost - ScalarCost;
}
@@ -3362,13 +3448,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(), VL0);
+ Builder.getInt1Ty(),
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
+ int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
+ CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3429,13 +3517,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3445,26 +3535,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_UniformConstantValue;
int ScalarEltCost =
- TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
+ Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
- TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
+ Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
+ Align alignment = cast<LoadInst>(VL0)->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
@@ -3477,14 +3571,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- MaybeAlign Alignment(SI->getAlignment());
+ Align Alignment = SI->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, Alignment, 0, VL0);
+ VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
@@ -3497,24 +3592,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type *, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
- ScalarTys.push_back(CI->getArgOperand(op)->getType());
-
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- int ScalarEltCost =
- TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
+ int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
- SmallVector<Value *, 4> Args(CI->arg_operands());
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
- VecTy->getNumElements());
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -3533,34 +3619,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+ CostKind);
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
- VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ CostKind);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
@@ -3596,24 +3682,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
return true;
}
-bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
- if (RdxOpcode != Instruction::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-
- // Look past the reduction to find a source value. Arbitrarily follow the
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+ TargetTransformInfo *TTI) {
+ // Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-constant.
- Value *ZextLoad = FirstReduced;
- while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ Value *ZextLoad = Root;
+ while (!isa<ConstantExpr>(ZextLoad) &&
+ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
- // Check if the input to the reduction is an extended load.
+ // Check if the input is an extended load of the required or/shift expression.
Value *LoadPtr;
- if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
return false;
// Require that the total load bit width is a legal integer type.
@@ -3621,15 +3703,36 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
Type *SrcTy = LoadPtr->getType()->getPointerElementType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- LLVMContext &Context = FirstReduced->getContext();
- if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
// Everything matched - assume that we can fold the whole sequence using
// load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
- << *(cast<Instruction>(FirstReduced)) << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
+ << *(cast<Instruction>(Root)) << "\n");
+
+ return true;
+}
+
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+ // Peek through a final sequence of stores and check if all operations are
+ // likely to be load-combined.
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ Value *X;
+ if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+ !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ return false;
+ }
return true;
}
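A hedged example of the kind of scalar idiom these checks protect, written as ordinary C++ rather than code from the pass: a wide value assembled from narrower loads via zext, shift-left-by-constant and or, which the backend can collapse into a single load (or a load plus bswap for the byte-reversed variant). Partially vectorizing the zext/shl/or chain would hide that opportunity, which is why the vectorizer leaves the pattern alone.

#include <cstdint>

static uint32_t loadLE32(const uint8_t *P) {
  return (uint32_t)P[0] |
         ((uint32_t)P[1] << 8) |
         ((uint32_t)P[2] << 16) |
         ((uint32_t)P[3] << 24);
}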
@@ -3712,7 +3815,7 @@ int BoUpSLP::getSpillCost() const {
if (NumCalls) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
- V.push_back(VectorType::get(II->getType(), BundleWidth));
+ V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
}
@@ -3776,13 +3879,13 @@ int BoUpSLP::getTreeCost() {
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
- auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
- VecTy = VectorType::get(MinTy, BundleWidth);
+ VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
@@ -3809,12 +3912,15 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty,
+int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
- int Cost = 0;
- for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ unsigned NumElts = Ty->getNumElements();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
+ for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ DemandedElts.setBit(i);
+ int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ /*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
@@ -3825,7 +3931,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
@@ -3965,9 +4071,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
V = SV->getOperand(0);
} else {
// Reshuffle to get only unique values.
- SmallVector<unsigned, 4> UniqueIdxs;
- SmallSet<unsigned, 4> UsedIdxs;
- for(unsigned Idx : E->ReuseShuffleIndices)
+ SmallVector<int, 4> UniqueIdxs;
+ SmallSet<int, 4> UsedIdxs;
+ for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
@@ -3984,7 +4090,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
@@ -4002,7 +4108,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
else
VL = UniqueValues;
}
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
@@ -4017,7 +4123,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
static void inversePermutation(ArrayRef<unsigned> Indices,
- SmallVectorImpl<unsigned> &Mask) {
+ SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
Mask.resize(E);
@@ -4037,7 +4143,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
@@ -4056,6 +4162,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
@@ -4096,72 +4203,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (E->State == TreeEntry::Vectorize) {
- Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- return V;
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (E->State == TreeEntry::Vectorize) {
- LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
- PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
- Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(
- NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = NewV;
- return NewV;
+ LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy =
+ PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
}
- E->VectorizedValue = V;
- return V;
+ E->VectorizedValue = NewV;
+ return NewV;
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -4207,12 +4287,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V;
- if (E->getOpcode() == Instruction::FCmp)
- V = Builder.CreateFCmp(P0, L, R);
- else
- V = Builder.CreateICmp(P0, L, R);
-
+ Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4321,7 +4396,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
- Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
@@ -4334,14 +4408,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
- MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
- LI = Builder.CreateLoad(VecTy, VecPtr);
- if (!Alignment)
- Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
- LI->setAlignment(Alignment);
+ LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
- OrdersType Mask;
+ SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
Mask, "reorder_shuffle");
@@ -4359,23 +4429,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI = cast<StoreInst>(
IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
- unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
if (IsReorder) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(
- VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
+ VecValue, UndefValue::get(VecValue->getType()), Mask,
"reorder_shuffle");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
+ SI->getAlign());
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -4383,10 +4453,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(ScalarPtr))
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
-
- ST->setAlignment(Align(Alignment));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4445,13 +4511,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (hasVectorInstrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
@@ -4463,10 +4535,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(OpVec);
}
- Module *M = F->getParent();
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
- Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ Function *CF;
+ if (!UseIntrinsic) {
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ }
+
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -4527,24 +4606,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
- SmallVector<Constant *, 8> Mask(e);
+ SmallVector<int, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = Builder.getInt32(e + i);
+ Mask[i] = e + i;
AltScalars.push_back(E->Scalars[i]);
} else {
- Mask[i] = Builder.getInt32(i);
+ Mask[i] = i;
OpScalars.push_back(E->Scalars[i]);
}
}
- Value *ShuffleMask = ConstantVector::get(Mask);
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
- Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
@@ -4586,7 +4664,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0]->VectorizedValue = Trunc;
}
@@ -4715,6 +4793,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
Builder.ClearInsertionPoint();
+ InstrElementSize.clear();
return VectorizableTree[0]->VectorizedValue;
}
@@ -5251,20 +5330,26 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-unsigned BoUpSLP::getVectorElementSize(Value *V) const {
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ auto E = InstrElementSize.find(V);
+ if (E != InstrElementSize.end())
+ return E->second;
+
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
- if (auto *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
Worklist.push_back(I);
+ Visited.insert(I);
+ }
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
@@ -5272,7 +5357,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
- Visited.insert(I);
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
@@ -5292,7 +5376,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (!Visited.count(J))
+ if (Visited.insert(J).second)
Worklist.push_back(J);
}
@@ -5301,13 +5385,17 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
FoundUnknownInst = true;
}
+ int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
- // gave up for some reason, just return the width of V.
+ // gave up for some reason, just return the width of V. Otherwise, return the
+ // maximum width we found.
if (!MaxWidth || FoundUnknownInst)
- return DL->getTypeSizeInBits(V->getType());
+ Width = DL->getTypeSizeInBits(V->getType());
- // Otherwise, return the maximum width we found.
- return MaxWidth;
+ for (Instruction *I : Visited)
+ InstrElementSize[I] = Width;
+
+ return Width;
}
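The change above memoizes the result of the bottom-up walk for every visited instruction. A minimal standalone sketch of that caching pattern, using plain STL types instead of the LLVM data structures (the Node type and widthOf name are illustrative only):

#include <algorithm>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
  unsigned Width = 0;
};

// Walk the expression DAG once, then record the answer for every node seen so
// that later queries on any of them are cache hits.
static unsigned widthOf(Node *Root,
                        std::unordered_map<Node *, unsigned> &Cache) {
  auto It = Cache.find(Root);
  if (It != Cache.end())
    return It->second;

  std::vector<Node *> Worklist{Root};
  std::unordered_set<Node *> Visited{Root};
  unsigned MaxWidth = 0;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    MaxWidth = std::max(MaxWidth, N->Width);
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second) // mark when pushing, as the patch now does
        Worklist.push_back(Op);
  }

  for (Node *N : Visited) // memoize for every node seen on this walk
    Cache[N] = MaxWidth;
  return MaxWidth;
}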
// Determine if a value V in a vectorizable expression Expr can be demoted to a
@@ -5560,6 +5648,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -5598,6 +5687,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
+ if (!RunSLPVectorization)
+ return false;
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -5657,7 +5748,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (Changed) {
R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
@@ -5688,6 +5778,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
}
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
+ if (R.isLoadCombineCandidate())
+ return false;
R.computeMinimumValueSizes();
@@ -5841,37 +5933,28 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
- Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+ Value *VL[] = {A, B};
+ return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- int UserCost, bool AllowReorder) {
+ bool AllowReorder,
+ ArrayRef<Value *> InsertUses) {
if (VL.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type,
+ // Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
if (!S.getOpcode())
return false;
Instruction *I0 = cast<Instruction>(S.OpValue);
- unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
- if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
- }
-
+ // Make sure invalid types (including vector type) are rejected before
+  // determining the vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
@@ -5889,16 +5972,35 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
}
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
bool Changed = false;
bool CandidateFound = false;
int MinCost = SLPCostThreshold;
+ bool CompensateUseCost =
+ !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+ return V && isa<InsertElementInst>(V);
+ });
+ assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+ "Each scalar expected to have an associated InsertElement user.");
+
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
- auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
@@ -5940,8 +6042,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost() - UserCost;
+ int Cost = R.getTreeCost();
CandidateFound = true;
+ if (CompensateUseCost) {
+      // TODO: Use TTI's getScalarizationOverhead for the sequence of inserts
+      // rather than the sum of single inserts, as the latter may overestimate
+      // the cost. This work should also imply improving the cost estimation
+      // for the extracts added for external (to the vectorization tree) users,
+      // i.e. that part should switch to the same interface as well.
+ // For example, the following case is projected code after SLP:
+ // %4 = extractelement <4 x i64> %3, i32 0
+ // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+ // %5 = extractelement <4 x i64> %3, i32 1
+ // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+ // %6 = extractelement <4 x i64> %3, i32 2
+ // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+ // %7 = extractelement <4 x i64> %3, i32 3
+ // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+ //
+      // The extracts here are added by SLP in order to feed users (the
+      // inserts) of the original scalars and contribute to "ExtractCost"
+      // during cost evaluation. The inserts in turn form a sequence that
+      // builds an aggregate, which is detected by the findBuildAggregate
+      // routine. SLP assumes that such a sequence will be optimized away
+      // later (by instcombine), so it tries to compensate ExtractCost with
+      // the cost of the insert sequence.
+      // The current per-element cost calculation is not quite accurate and
+      // tends to bias the decision toward vectorization. Switching to the TTI
+      // interface might help a bit. An alternative solution could be to
+      // pattern-match and detect a no-op or a shuffle.
+ unsigned UserCost = 0;
+ for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+ auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ UserCost += TTI->getVectorInstrCost(
+ Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+ << ".\n");
+ Cost -= UserCost;
+ }
+
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
@@ -6031,24 +6173,23 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
-static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
- bool IsPairwise, bool IsLeft,
- IRBuilder<> &Builder) {
+static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
+ unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
- SmallVector<Constant *, 32> ShuffleMask(
- VecLen, UndefValue::get(Builder.getInt32Ty()));
+ SmallVector<int, 32> ShuffleMask(VecLen, -1);
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ ShuffleMask[i] = 2 * i + !IsLeft;
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+ ShuffleMask[i] = NumEltsToRdx + i;
- return ConstantVector::get(ShuffleMask);
+ return ShuffleMask;
}
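A standalone sketch of the mask shapes the helper now returns, with -1 standing for an undef lane as in the sentinel above (the concrete VecLen/NumEltsToRdx values are just examples):

#include <vector>

static std::vector<int> rdxMaskSketch(unsigned VecLen, unsigned NumEltsToRdx,
                                      bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1); // -1 == undef lane
  for (unsigned i = 0; i != NumEltsToRdx; ++i)
    Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
  return Mask;
}

// rdxMaskSketch(8, 4, /*IsPairwise=*/true,  /*IsLeft=*/true ) -> {0,2,4,6,-1,-1,-1,-1}
// rdxMaskSketch(8, 4, /*IsPairwise=*/true,  /*IsLeft=*/false) -> {1,3,5,7,-1,-1,-1,-1}
// rdxMaskSketch(8, 4, /*IsPairwise=*/false, /*IsLeft=*/false) -> {4,5,6,7,-1,-1,-1,-1}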
namespace {
@@ -6840,7 +6981,7 @@ private:
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
- Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+ auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost;
int SplittingRdxCost;
@@ -6857,7 +6998,7 @@ private:
case RK_Max:
case RK_UMin:
case RK_UMax: {
- Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =
@@ -6922,10 +7063,8 @@ private:
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- Value *LeftMask =
- createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
- createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+ auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
+ auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
@@ -6960,20 +7099,16 @@ private:
/// \return true if it matches.
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
- int &UserCost) {
+ SmallVectorImpl<Value *> &InsertElts) {
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
- UserCost = 0;
do {
Value *InsertedOperand;
- if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+ auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+ if (IE) {
InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
- UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
- IE->getType(), CI->getZExtValue());
- }
} else {
auto *IV = cast<InsertValueInst>(LastInsertInst);
InsertedOperand = IV->getInsertedValueOperand();
@@ -6981,16 +7116,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
}
if (isa<InsertElementInst>(InsertedOperand) ||
isa<InsertValueInst>(InsertedOperand)) {
- int TmpUserCost;
SmallVector<Value *, 8> TmpBuildVectorOpds;
+ SmallVector<Value *, 8> TmpInsertElts;
if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
- TmpUserCost))
+ TmpInsertElts))
return false;
BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
TmpBuildVectorOpds.rend());
- UserCost += TmpUserCost;
+ InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
} else {
BuildVectorOpds.push_back(InsertedOperand);
+ InsertElts.push_back(IE);
}
if (isa<UndefValue>(LastInsertInst))
break;
@@ -7000,6 +7136,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ std::reverse(InsertElts.begin(), InsertElts.end());
return true;
}
@@ -7164,26 +7301,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost = 0;
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+ SmallVector<Value *, 16> BuildVectorInsts;
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost;
+ SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7191,7 +7331,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7228,6 +7369,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7254,8 +7396,18 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ Type *EltTy = (*IncIt)->getType();
+ unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
+ : MaxVecRegSize;
+ unsigned MaxNumElts = MaxVecRegSize / EltSize;
+ if (MaxNumElts < 2) {
+ ++IncIt;
+ continue;
+ }
+
while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ (*SameTypeIt)->getType() == EltTy &&
+ (SameTypeIt - IncIt) < MaxNumElts) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
@@ -7269,8 +7421,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- /*UserCost=*/0, AllowReorder)) {
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -7370,9 +7522,12 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
<< Entry.second.size() << ".\n");
// Process the GEP list in chunks suitable for the target's supported
- // vector size. If a vector register can't hold 1 element, we are done.
+ // vector size. If a vector register can't hold 1 element, we are done. We
+ // are trying to vectorize the index computations, so the maximum number of
+ // elements is based on the size of the index expression, rather than the
+ // size of the GEP itself (the target's pointer size).
unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+ unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
if (MaxVecRegSize < EltSize)
continue;
@@ -7475,6 +7630,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 598fb00e956ea..6f055ca80ff29 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -18,7 +18,6 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
-class TargetTransformInfo;
class TargetLibraryInfo;
/// Helper class to create VPRecipies from IR instructions.
@@ -35,6 +34,8 @@ class VPRecipeBuilder {
/// The profitablity analysis.
LoopVectorizationCostModel &CM;
+ PredicatedScalarEvolution &PSE;
+
VPBuilder &Builder;
/// When we if-convert we need to create edge masks. We have to cache values
@@ -49,11 +50,57 @@ class VPRecipeBuilder {
// VPlan-VPlan transformations support: Hold a mapping from ingredients to
// their recipe. To save on memory, only do so for selected ingredients,
- // marked by having a nullptr entry in this map. If those ingredients get a
- // VPWidenRecipe, also avoid compressing other ingredients into it to avoid
- // having to split such recipes later.
+ // marked by having a nullptr entry in this map.
DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
- VPWidenRecipe *LastExtensibleRecipe = nullptr;
+
+ /// Check if \p I can be widened at the start of \p Range and possibly
+ /// decrease the range such that the returned value holds for the entire \p
+ /// Range. The function should not be called for memory instructions or calls.
+ bool shouldWiden(Instruction *I, VFRange &Range) const;
+
+  /// Check if the load or store instruction \p I should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+  /// Check if an induction recipe should be constructed for \p I. If so, build
+  /// and return it. If not, return null.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi) const;
+
+ /// Optimize the special case where the operand of \p I is a constant integer
+ /// induction variable.
+ VPWidenIntOrFpInductionRecipe *
+ tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range) const;
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
+
+ /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+  /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure the
+  /// same decision from \p Range.Start to \p Range.End.
+ VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const;
+
+ /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
+ /// if it can. The function should only be called if the cost-model indicates
+ /// that widening should be performed.
+ VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ PredicatedScalarEvolution &PSE, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
+ Builder(Builder) {}
+
+  /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, return it. Otherwise return nullptr.
+ VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan);
/// Set the recipe created for given ingredient. This operation is a no-op for
/// ingredients that were not marked using a nullptr entry in the map.
@@ -65,7 +112,6 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
-public:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
@@ -92,48 +138,11 @@ public:
return Ingredient2Recipe[I];
}
- /// Check if \I is a memory instruction to be widened for \p Range.Start and
- /// potentially masked. Such instructions are handled by a recipe that takes
- /// an additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *
- tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
-
- /// Check if an induction recipe should be constructed for \I within the given
- /// VF \p Range. If so build and return it. If not, return null. \p Range.End
- /// may be decreased to ensure same decision from \p Range.Start to
- /// \p Range.End.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
- VFRange &Range);
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
-
- /// Check if \p I can be widened within the given VF \p Range. If \p I can be
- /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
- /// extended to include \p I or else build a new VPWidenRecipe for it and
- /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
- /// false otherwise. Range.End may be decreased to ensure same decision from
- /// \p Range.Start to \p Range.End.
- bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
-
/// Create a replicating region for instruction \p I that requires
/// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
VPlanPtr &Plan);
-public:
- VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM, VPBuilder &Builder)
- : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
-
- /// Check if a recipe can be create for \p I withing the given VF \p Range.
- /// If a recipe can be created, it adds it to \p VPBB.
- bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
- VPBasicBlock *VPBB);
-
/// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
/// is predicated. \return \p VPBB augmented with this new recipe if \p I is
/// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f1c708720ccf4..f5f28a3bffa18 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -49,13 +49,46 @@ extern cl::opt<bool> EnableVPlanNativePath;
#define DEBUG_TYPE "vplan"
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
- if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V))
- Instr->print(OS);
- else
- V.printAsOperand(OS);
+ const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ V.print(OS, SlotTracker);
return OS;
}
+void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
+ if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this))
+ Instr->print(OS, SlotTracker);
+ else
+ printAsOperand(OS, SlotTracker);
+}
+
+// Get the top-most entry block of \p Start. This is the entry block of the
+// containing VPlan. This function is templated to support both const and
+// non-const blocks.
+template <typename T> static T *getPlanEntry(T *Start) {
+ T *Next = Start;
+ T *Current = Start;
+ while ((Next = Next->getParent()))
+ Current = Next;
+
+ SmallSetVector<T *, 8> WorkList;
+ WorkList.insert(Current);
+
+ for (unsigned i = 0; i < WorkList.size(); i++) {
+ T *Current = WorkList[i];
+ if (Current->getNumPredecessors() == 0)
+ return Current;
+ auto &Predecessors = Current->getPredecessors();
+ WorkList.insert(Predecessors.begin(), Predecessors.end());
+ }
+
+ llvm_unreachable("VPlan without any entry node without predecessors");
+}
+
+VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
+
+const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
+
/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
const VPBlockBase *Block = this;
@@ -71,6 +104,12 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
return cast<VPBasicBlock>(Block);
}
+void VPBlockBase::setPlan(VPlan *ParentPlan) {
+ assert(ParentPlan->getEntry() == this &&
+ "Can only set plan on its entry block.");
+ Plan = ParentPlan;
+}
+
/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
const VPBlockBase *Block = this;
@@ -341,6 +380,20 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, V, Part);
break;
}
+ case VPInstruction::ActiveLaneMask: {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+ // Get first lane of backedge-taken-count.
+ Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+ auto *PredTy = FixedVectorType::get(Int1Ty, State.VF);
+ Instruction *Call = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
+ {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+ State.set(this, Call, Part);
+ break;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -352,15 +405,22 @@ void VPInstruction::execute(VPTransformState &State) {
generateInstruction(State, Part);
}
-void VPInstruction::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"EMIT ";
- print(O);
- O << "\\l\"";
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"EMIT ";
+ print(O, SlotTracker);
}
void VPInstruction::print(raw_ostream &O) const {
- printAsOperand(O);
- O << " = ";
+ VPSlotTracker SlotTracker(getParent()->getPlan());
+ print(O, SlotTracker);
+}
+
+void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
+ if (hasResult()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
switch (getOpcode()) {
case VPInstruction::Not:
@@ -375,13 +435,17 @@ void VPInstruction::print(raw_ostream &O) const {
case VPInstruction::SLPStore:
O << "combined store";
break;
+ case VPInstruction::ActiveLaneMask:
+ O << "active lane mask";
+ break;
+
default:
O << Instruction::getOpcodeName(getOpcode());
}
for (const VPValue *Operand : operands()) {
O << " ";
- Operand->printAsOperand(O);
+ Operand->printAsOperand(O, SlotTracker);
}
}
@@ -395,7 +459,11 @@ void VPlan::execute(VPTransformState *State) {
IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
"trip.count.minus.1");
- Value2VPValue[TCMO] = BackedgeTakenCount;
+ auto VF = State->VF;
+ Value *VTCMO =
+ VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
+ State->set(BackedgeTakenCount, VTCMO, Part);
}
// 0. Set the reverse mapping from VPValues to Values for code generation.
@@ -533,15 +601,10 @@ void VPlanPrinter::dump() {
OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
if (!Plan.getName().empty())
OS << "\\n" << DOT::EscapeString(Plan.getName());
- if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
- OS << ", where:";
- if (Plan.BackedgeTakenCount)
- OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount";
- for (auto Entry : Plan.Value2VPValue) {
- OS << "\\n" << *Entry.second;
- OS << DOT::EscapeString(" := ");
- Entry.first->printAsOperand(OS, false);
- }
+ if (Plan.BackedgeTakenCount) {
+ OS << ", where:\\n";
+ Plan.BackedgeTakenCount->print(OS, SlotTracker);
+ OS << " := BackedgeTakenCount";
}
OS << "\"]\n";
OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
@@ -605,25 +668,28 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
if (Pred) {
OS << " +\n" << Indent << " \"BlockPredicate: ";
if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
- PredI->printAsOperand(OS);
+ PredI->printAsOperand(OS, SlotTracker);
OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
<< ")\\l\"";
} else
- Pred->printAsOperand(OS);
+ Pred->printAsOperand(OS, SlotTracker);
}
- for (const VPRecipeBase &Recipe : *BasicBlock)
- Recipe.print(OS, Indent);
+ for (const VPRecipeBase &Recipe : *BasicBlock) {
+ OS << " +\n" << Indent;
+ Recipe.print(OS, Indent, SlotTracker);
+ OS << "\\l\"";
+ }
// Dump the condition bit.
const VPValue *CBV = BasicBlock->getCondBit();
if (CBV) {
OS << " +\n" << Indent << " \"CondBit: ";
if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
- CBI->printAsOperand(OS);
+ CBI->printAsOperand(OS, SlotTracker);
OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
} else {
- CBV->printAsOperand(OS);
+ CBV->printAsOperand(OS, SlotTracker);
OS << "\"";
}
}
@@ -670,83 +736,121 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
O << DOT::EscapeString(IngredientString);
}
-void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN\\l\"";
- for (auto &Instr : make_range(Begin, End))
- O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\"";
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient);
+}
+
+void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient)
+ << (InvariantCond ? " (condition is loop invariant)" : "");
}
-void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O,
- const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-INDUCTION";
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN\\l\"";
+ O << "\" " << VPlanIngredient(&Ingredient);
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-INDUCTION";
if (Trunc) {
O << "\\l\"";
O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
} else
- O << " " << VPlanIngredient(IV) << "\\l\"";
+ O << " " << VPlanIngredient(IV);
}
-void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-GEP ";
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-GEP ";
O << (IsPtrLoopInvariant ? "Inv" : "Var");
size_t IndicesNumber = IsIndexLoopInvariant.size();
for (size_t I = 0; I < IndicesNumber; ++I)
O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(GEP);
}
-void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-PHI " << VPlanIngredient(Phi);
}
-void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"BLEND ";
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"BLEND ";
Phi->printAsOperand(O, false);
O << " =";
- if (!User) {
+ if (getNumIncomingValues() == 1) {
// Not a User of any mask: not really blending, this is a
// single-predecessor phi.
O << " ";
- Phi->getIncomingValue(0)->printAsOperand(O, false);
+ getIncomingValue(0)->printAsOperand(O, SlotTracker);
} else {
- for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
+ for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
O << " ";
- Phi->getIncomingValue(I)->printAsOperand(O, false);
+ getIncomingValue(I)->printAsOperand(O, SlotTracker);
O << "/";
- User->getOperand(I)->printAsOperand(O);
+ getMask(I)->printAsOperand(O, SlotTracker);
}
}
- O << "\\l\"";
}
-void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
<< VPlanIngredient(Ingredient);
if (AlsoPack)
O << " (S->V)";
- O << "\\l\"";
}
-void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
- << "\\l\"";
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst);
}
-void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
- const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN " << VPlanIngredient(&Instr);
O << ", ";
- getAddr()->printAsOperand(O);
+ getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
O << ", ";
- Mask->printAsOperand(O);
+ Mask->printAsOperand(O, SlotTracker);
}
- O << "\\l\"";
+}
+
+void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
+ Value *CanonicalIV = State.CanonicalIV;
+ Type *STy = CanonicalIV->getType();
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ auto VF = State.VF;
+ Value *VStart = VF == 1
+ ? CanonicalIV
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ Indices.push_back(ConstantInt::get(STy, Part * VF + Lane));
+ // If VF == 1, there is only one iteration in the loop above, thus the
+    // element pushed back into Indices is ConstantInt::get(STy, Part).
+ Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices);
+ // Add the consecutive indices to the vector value.
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
+ State.set(getVPValue(), CanonicalVectorIV, Part);
+ }
+}
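To make the values produced above concrete, here is a small standalone sketch with plain integers in place of IR values (VF = 4 and UF = 2 are chosen only as an example): each part is the splatted scalar IV plus the consecutive lane offsets Part * VF + Lane.

#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, UF = 2;
  long CanonicalIV = 8; // current value of the scalar canonical IV
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::array<long, VF> VecIV;
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      VecIV[Lane] = CanonicalIV + long(Part * VF + Lane);
    std::printf("vec.iv part %u: {%ld, %ld, %ld, %ld}\n", Part, VecIV[0],
                VecIV[1], VecIV[2], VecIV[3]);
  }
  // Prints {8, 9, 10, 11} for part 0 and {12, 13, 14, 15} for part 1.
  return 0;
}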
+
+void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"EMIT ";
+ getVPValue()->printAsOperand(O, SlotTracker);
+ O << " = WIDEN-CANONICAL-INDUCTION";
}
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
@@ -758,6 +862,21 @@ void VPValue::replaceAllUsesWith(VPValue *New) {
User->setOperand(I, New);
}
+void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
+ if (const Value *UV = getUnderlyingValue()) {
+ OS << "ir<";
+ UV->printAsOperand(OS, false);
+ OS << ">";
+ return;
+ }
+
+ unsigned Slot = Tracker.getSlot(this);
+ if (Slot == unsigned(-1))
+ OS << "<badref>";
+ else
+ OS << "vp<%" << Tracker.getSlot(this) << ">";
+}
+
void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
@@ -781,7 +900,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
auto NewIGIter = Old2New.find(IG);
if (NewIGIter == Old2New.end())
Old2New[IG] = new InterleaveGroup<VPInstruction>(
- IG->getFactor(), IG->isReverse(), Align(IG->getAlignment()));
+ IG->getFactor(), IG->isReverse(), IG->getAlign());
if (Inst == IG->getInsertPos())
Old2New[IG]->setInsertPos(VPInst);
@@ -803,3 +922,57 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
Old2NewTy Old2New;
visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
}
+
+void VPSlotTracker::assignSlot(const VPValue *V) {
+ assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
+ const Value *UV = V->getUnderlyingValue();
+ if (UV)
+ return;
+ const auto *VPI = dyn_cast<VPInstruction>(V);
+ if (VPI && !VPI->hasResult())
+ return;
+
+ Slots[V] = NextSlot++;
+}
+
+void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
+ if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
+ assignSlots(Region);
+ else
+ assignSlots(cast<VPBasicBlock>(VPBB));
+}
+
+void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
+
+void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+ for (const VPRecipeBase &Recipe : *VPBB) {
+ if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe))
+ assignSlot(VPI);
+ else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe))
+ assignSlot(VPIV->getVPValue());
+ }
+}
+
+void VPSlotTracker::assignSlots(const VPlan &Plan) {
+
+ for (const VPValue *V : Plan.VPExternalDefs)
+ assignSlot(V);
+
+ for (auto &E : Plan.Value2VPValue)
+ if (!isa<VPInstruction>(E.second))
+ assignSlot(E.second);
+
+ for (const VPValue *V : Plan.VPCBVs)
+ assignSlot(V);
+
+ if (Plan.BackedgeTakenCount)
+ assignSlot(Plan.BackedgeTakenCount);
+
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
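For intuition about the numbering scheme introduced here, a minimal standalone sketch (the class and member names below are illustrative only): values backed by an underlying IR value print as ir<...> and never receive a slot, while everything else is numbered in traversal order and prints as vp<%N>.

#include <map>
#include <string>

struct SlotTrackerSketch {
  std::map<const void *, unsigned> Slots;
  unsigned NextSlot = 0;

  // Mirror assignSlot: only values without an underlying IR value get a slot.
  void assign(const void *V, bool HasUnderlyingIRValue) {
    if (!HasUnderlyingIRValue && !Slots.count(V))
      Slots[V] = NextSlot++;
  }

  // Mirror printAsOperand for slot-based values.
  std::string name(const void *V) const {
    auto It = Slots.find(V);
    if (It == Slots.end())
      return "<badref>";
    return "vp<%" + std::to_string(It->second) + ">";
  }
};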
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c65abc3639d73..f07c94e7a3c7d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -48,8 +48,6 @@
namespace llvm {
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
@@ -59,6 +57,7 @@ class raw_ostream;
class Value;
class VPBasicBlock;
class VPRegionBlock;
+class VPSlotTracker;
class VPlan;
class VPlanSlp;
@@ -271,10 +270,20 @@ struct VPTransformState {
return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
}
- /// Get the generated Value for a given VPValue and given Part and Lane. Note
- /// that as per-lane Defs are still created by ILV and managed in its ValueMap
- /// this method currently just delegates the call to ILV.
+ /// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(VPValue *Def, const VPIteration &Instance) {
+ // If the Def is managed directly by VPTransformState, extract the lane from
+ // the relevant part. Note that currently only VPInstructions and external
+ // defs are managed by VPTransformState. Other Defs are still created by ILV
+    // and managed in its ValueMap. For those, this method currently just
+ // delegates the call to ILV below.
+ if (Data.PerPartOutput.count(Def)) {
+ auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
+ // TODO: Cache created scalar values.
+ return Builder.CreateExtractElement(VecPart,
+ Builder.getInt32(Instance.Lane));
+ }
+
return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
}
@@ -329,6 +338,9 @@ struct VPTransformState {
/// Values they correspond to.
VPValue2ValueTy VPValue2Value;
+ /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
+ Value *CanonicalIV = nullptr;
+
/// Hold the trip count of the scalar loop.
Value *TripCount = nullptr;
@@ -343,7 +355,6 @@ struct VPTransformState {
class VPBlockBase {
friend class VPBlockUtils;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
/// An optional name for the block.
@@ -365,6 +376,10 @@ private:
/// Current block predicate - null if the block does not need a predicate.
VPValue *Predicate = nullptr;
+ /// VPlan containing the block. Can only be set on the entry block of the
+ /// plan.
+ VPlan *Plan = nullptr;
+
/// Add \p Successor as the last successor to this block.
void appendSuccessor(VPBlockBase *Successor) {
assert(Successor && "Cannot add nullptr successor!");
@@ -418,6 +433,14 @@ public:
VPRegionBlock *getParent() { return Parent; }
const VPRegionBlock *getParent() const { return Parent; }
+ /// \return A pointer to the plan containing the current block.
+ VPlan *getPlan();
+ const VPlan *getPlan() const;
+
+ /// Sets the pointer of the plan containing the block. The block must be the
+ /// entry block into the VPlan.
+ void setPlan(VPlan *ParentPlan);
+
void setParent(VPRegionBlock *P) { Parent = P; }
/// \return the VPBasicBlock that is the entry of this VPBlockBase,
@@ -579,7 +602,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
friend VPBasicBlock;
friend class VPBlockUtils;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
/// Each VPRecipe belongs to a single VPBasicBlock.
@@ -597,11 +619,14 @@ public:
VPInterleaveSC,
VPPredInstPHISC,
VPReplicateSC,
+ VPWidenCallSC,
+ VPWidenCanonicalIVSC,
VPWidenGEPSC,
VPWidenIntOrFpInductionSC,
VPWidenMemoryInstructionSC,
VPWidenPHISC,
VPWidenSC,
+ VPWidenSelectSC
};
VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
@@ -621,7 +646,8 @@ public:
virtual void execute(struct VPTransformState &State) = 0;
/// Each recipe prints itself.
- virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+ virtual void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const = 0;
/// Insert an unlinked recipe into a basic block immediately before
/// the specified recipe.
@@ -659,6 +685,7 @@ public:
ICmpULE,
SLPLoad,
SLPStore,
+ ActiveLaneMask,
};
private:
@@ -707,10 +734,12 @@ public:
void execute(VPTransformState &State) override;
/// Print the Recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Print the VPInstruction.
void print(raw_ostream &O) const;
+ void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
/// Return true if this instruction may modify memory.
bool mayWriteToMemory() const {
@@ -719,23 +748,42 @@ public:
return Opcode == Instruction::Store || Opcode == Instruction::Call ||
Opcode == Instruction::Invoke || Opcode == SLPStore;
}
+
+ bool hasResult() const {
+ // CallInst may or may not have a result, depending on the called function.
+    // Conservatively assume that calls have results for now.
+ switch (getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::Br:
+ case Instruction::Store:
+ case Instruction::Switch:
+ case Instruction::IndirectBr:
+ case Instruction::Resume:
+ case Instruction::CatchRet:
+ case Instruction::Unreachable:
+ case Instruction::Fence:
+ case Instruction::AtomicRMW:
+ return false;
+ default:
+ return true;
+ }
+ }
};
-/// VPWidenRecipe is a recipe for producing a copy of vector type for each
-/// Instruction in its ingredients independently, in order. This recipe covers
-/// most of the traditional vectorization cases where each ingredient transforms
-/// into a vectorized version of itself.
+/// VPWidenRecipe is a recipe for producing a copy of vector type for its
+/// ingredient. This recipe covers most of the traditional vectorization cases
+/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeBase {
-private:
- /// Hold the ingredients by pointing to their original BasicBlock location.
- BasicBlock::iterator Begin;
- BasicBlock::iterator End;
+ /// Hold the instruction to be widened.
+ Instruction &Ingredient;
+
+ /// Hold VPValues for the operands of the ingredient.
+ VPUser User;
public:
- VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
- End = I->getIterator();
- Begin = End++;
- }
+ template <typename IterT>
+ VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
+ : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {}
~VPWidenRecipe() override = default;
@@ -747,28 +795,88 @@ public:
/// Produce widened copies of all Ingredients.
void execute(VPTransformState &State) override;
- /// Augment the recipe to include Instr, if it lies at its End.
- bool appendInstruction(Instruction *Instr) {
- if (End != Instr->getIterator())
- return false;
- End++;
- return true;
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening Call instructions.
+class VPWidenCallRecipe : public VPRecipeBase {
+ /// Hold the call to be widened.
+ CallInst &Ingredient;
+
+ /// Hold VPValues for the arguments of the call.
+ VPUser User;
+
+public:
+ template <typename IterT>
+ VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+ : VPRecipeBase(VPWidenCallSC), Ingredient(I), User(CallArguments) {}
+
+ ~VPWidenCallRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
}
+ /// Produce a widened version of the call instruction.
+ void execute(VPTransformState &State) override;
+
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening select instructions.
+class VPWidenSelectRecipe : public VPRecipeBase {
+private:
+ /// Hold the select to be widened.
+ SelectInst &Ingredient;
+
+ /// Hold VPValues for the operands of the select.
+ VPUser User;
+
+ /// Is the condition of the select loop invariant?
+ bool InvariantCond;
+
+public:
+ template <typename IterT>
+ VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
+ bool InvariantCond)
+ : VPRecipeBase(VPWidenSelectSC), Ingredient(I), User(Operands),
+ InvariantCond(InvariantCond) {}
+
+ ~VPWidenSelectRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC;
+ }
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeBase {
-private:
GetElementPtrInst *GEP;
+
+ /// Hold VPValues for the base and indices of the GEP.
+ VPUser User;
+
bool IsPtrLoopInvariant;
SmallBitVector IsIndexLoopInvariant;
public:
- VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop)
- : VPRecipeBase(VPWidenGEPSC), GEP(GEP),
+ template <typename IterT>
+ VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
+ Loop *OrigLoop)
+ : VPRecipeBase(VPWidenGEPSC), GEP(GEP), User(Operands),
IsIndexLoopInvariant(GEP->getNumIndices(), false) {
IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
for (auto Index : enumerate(GEP->indices()))
@@ -786,13 +894,13 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
-private:
PHINode *IV;
TruncInst *Trunc;
@@ -811,12 +919,12 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling all phi nodes except for integer and FP inductions.
class VPWidenPHIRecipe : public VPRecipeBase {
-private:
PHINode *Phi;
public:
@@ -832,26 +940,27 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPRecipeBase {
-private:
PHINode *Phi;
- /// The blend operation is a User of a mask, if not null.
- std::unique_ptr<VPUser> User;
+ /// The blend operation is a User of the incoming values and of their
+ /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
+ /// might be incoming with a full mask for which there is no VPValue.
+ VPUser User;
public:
- VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks)
- : VPRecipeBase(VPBlendSC), Phi(Phi) {
- assert((Phi->getNumIncomingValues() == 1 ||
- Phi->getNumIncomingValues() == Masks.size()) &&
- "Expected the same number of incoming values and masks");
- if (!Masks.empty())
- User.reset(new VPUser(Masks));
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
+ : VPRecipeBase(VPBlendSC), Phi(Phi), User(Operands) {
+ assert(Operands.size() > 0 &&
+ ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
+ "Expected either a single incoming value or a positive even number "
+ "of operands");
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -859,17 +968,31 @@ public:
return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
}
+ /// Return the number of incoming values, taking into account that a single
+ /// incoming value has no mask.
+ unsigned getNumIncomingValues() const {
+ return (User.getNumOperands() + 1) / 2;
+ }
+
+ /// Return incoming value number \p Idx.
+ VPValue *getIncomingValue(unsigned Idx) const {
+ return User.getOperand(Idx * 2);
+ }
+
+ /// Return mask number \p Idx.
+ VPValue *getMask(unsigned Idx) const { return User.getOperand(Idx * 2 + 1); }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
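The interleaved [I0, M0, I1, M1, ...] layout is easiest to see from a consumer's side. Below is a minimal sketch, assuming a hypothetical helper that is not part of this patch; it only uses the accessors defined above plus llvm::function_ref, and it respects the single-incoming-value case, which carries no mask.

  // Hypothetical helper: visit each (incoming value, mask) pair of a blend.
  static void forEachIncoming(const VPBlendRecipe &Blend,
                              llvm::function_ref<void(VPValue *, VPValue *)> Fn) {
    unsigned NumIncoming = Blend.getNumIncomingValues();
    for (unsigned In = 0; In != NumIncoming; ++In) {
      // A single incoming value is stored without a mask operand (full mask).
      VPValue *Mask = NumIncoming == 1 ? nullptr : Blend.getMask(In);
      Fn(Blend.getIncomingValue(In), Mask);
    }
  }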
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles.
class VPInterleaveRecipe : public VPRecipeBase {
-private:
const InterleaveGroup<Instruction> *IG;
VPUser User;
@@ -903,7 +1026,8 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
};
@@ -913,10 +1037,12 @@ public:
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform only one copy, per lane zero, will be generated.
class VPReplicateRecipe : public VPRecipeBase {
-private:
/// The instruction being replicated.
Instruction *Ingredient;
+ /// Hold VPValues for the operands of the ingredient.
+ VPUser User;
+
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
@@ -927,9 +1053,11 @@ private:
bool AlsoPack;
public:
- VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
- : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
- IsPredicated(IsPredicated) {
+ template <typename IterT>
+ VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
+ bool IsUniform, bool IsPredicated = false)
+ : VPRecipeBase(VPReplicateSC), Ingredient(I), User(Operands),
+ IsUniform(IsUniform), IsPredicated(IsPredicated) {
// Retain the previous behavior of predicateInstructions(), where an
// insert-element of a predicated instruction got hoisted into the
// predicated basic block iff it was its only user. This is achieved by
@@ -953,18 +1081,18 @@ public:
void setAlsoPack(bool Pack) { AlsoPack = Pack; }
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for generating conditional branches on the bits of a mask.
class VPBranchOnMaskRecipe : public VPRecipeBase {
-private:
- std::unique_ptr<VPUser> User;
+ VPUser User;
public:
VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
if (BlockInMask) // nullptr means all-one mask.
- User.reset(new VPUser({BlockInMask}));
+ User.addOperand(BlockInMask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -977,14 +1105,23 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override {
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override {
O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
- if (User)
- O << *User->getOperand(0);
+ if (VPValue *Mask = getMask())
+ Mask->print(O, SlotTracker);
else
O << " All-One";
O << "\\l\"";
}
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ assert(User.getNumOperands() <= 1 && "should have either 0 or 1 operands");
+ // Mask is optional.
+ return User.getNumOperands() == 1 ? User.getOperand(0) : nullptr;
+ }
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
@@ -993,7 +1130,6 @@ public:
/// The phi nodes can be scalar or vector depending on the users of the value.
/// This recipe works in concert with VPBranchOnMaskRecipe.
class VPPredInstPHIRecipe : public VPRecipeBase {
-private:
Instruction *PredInst;
public:
@@ -1012,23 +1148,42 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A Recipe for widening load/store operations.
+/// The recipe uses the following VPValues:
+/// - For load: Address, optional mask
+/// - For store: Address, stored value, optional mask
/// TODO: We currently execute only per-part unless a specific instance is
/// provided.
class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
-private:
Instruction &Instr;
VPUser User;
+ void setMask(VPValue *Mask) {
+ if (!Mask)
+ return;
+ User.addOperand(Mask);
+ }
+
+ bool isMasked() const {
+ return (isa<LoadInst>(Instr) && User.getNumOperands() == 2) ||
+ (isa<StoreInst>(Instr) && User.getNumOperands() == 3);
+ }
+
public:
- VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr,
- VPValue *Mask)
- : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) {
- if (Mask)
- User.addOperand(Mask);
+ VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Load), User({Addr}) {
+ setMask(Mask);
+ }
+
+ VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+ VPValue *StoredValue, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Store),
+ User({Addr, StoredValue}) {
+ setMask(Mask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1044,15 +1199,52 @@ public:
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
- return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
+ // Mask is optional and therefore the last operand.
+ return isMasked() ? User.getOperand(User.getNumOperands() - 1) : nullptr;
+ }
+
+ /// Return the value stored by this recipe.
+ VPValue *getStoredValue() const {
+ assert(isa<StoreInst>(Instr) &&
+ "Stored value only available for store instructions");
+ return User.getOperand(1); // Stored value is the 2nd, mandatory operand.
}
/// Generate the wide load/store.
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
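As a quick check on the operand layout documented above, here is a minimal sketch (a hypothetical function, for illustration only) that builds a store recipe and verifies where the stored value and the optional mask land; for the unmasked form getMask() returns nullptr, which stands for an all-one mask.

  // Hypothetical illustration of the {Addr, StoredValue[, Mask]} layout.
  static void checkStoreRecipeLayout(StoreInst &Store, VPValue *Addr,
                                     VPValue *StoredValue, VPValue *Mask) {
    VPWidenMemoryInstructionRecipe R(Store, Addr, StoredValue, Mask);
    assert(R.getStoredValue() == StoredValue && "stored value is operand 1");
    // With a null Mask the recipe has two operands and getMask() is nullptr.
    assert(R.getMask() == Mask && "mask, when present, is the last operand");
    (void)R; // Only used by the asserts above.
  }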
+
+/// A Recipe for widening the canonical induction variable of the vector loop.
+class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+ /// A VPValue representing the canonical vector IV.
+ VPValue Val;
+
+public:
+ VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {}
+ ~VPWidenCanonicalIVRecipe() override = default;
+
+ /// Return the VPValue representing the canonical vector induction variable of
+ /// the vector loop.
+ const VPValue *getVPValue() const { return &Val; }
+ VPValue *getVPValue() { return &Val; }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+
+ /// Generate a canonical vector induction variable of the vector loop, with
+ /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
+ /// step = <VF*UF, VF*UF, ..., VF*UF>.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -1144,7 +1336,6 @@ private:
/// candidate VF's. The actual replication takes place only once the desired VF
/// and UF have been determined.
class VPRegionBlock : public VPBlockBase {
-private:
/// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
VPBlockBase *Entry;
@@ -1347,8 +1538,8 @@ struct GraphTraits<Inverse<VPRegionBlock *>>
/// VPBlock.
class VPlan {
friend class VPlanPrinter;
+ friend class VPSlotTracker;
-private:
/// Hold the single entry to the Hierarchical CFG of the VPlan.
VPBlockBase *Entry;
@@ -1380,16 +1571,18 @@ private:
SmallVector<VPValue *, 4> VPCBVs;
public:
- VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+ if (Entry)
+ Entry->setPlan(this);
+ }
~VPlan() {
if (Entry)
VPBlockBase::deleteCFG(Entry);
for (auto &MapEntry : Value2VPValue)
- if (MapEntry.second != BackedgeTakenCount)
- delete MapEntry.second;
+ delete MapEntry.second;
if (BackedgeTakenCount)
- delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
+ delete BackedgeTakenCount;
for (VPValue *Def : VPExternalDefs)
delete Def;
for (VPValue *CBV : VPCBVs)
@@ -1402,7 +1595,11 @@ public:
VPBlockBase *getEntry() { return Entry; }
const VPBlockBase *getEntry() const { return Entry; }
- VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+ VPBlockBase *setEntry(VPBlockBase *Block) {
+ Entry = Block;
+ Block->setPlan(this);
+ return Entry;
+ }
/// The backedge taken count of the original loop.
VPValue *getOrCreateBackedgeTakenCount() {
@@ -1433,7 +1630,7 @@ public:
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
- Value2VPValue[V] = new VPValue();
+ Value2VPValue[V] = new VPValue(V);
}
VPValue *getVPValue(Value *V) {
@@ -1456,6 +1653,16 @@ public:
/// Dump the plan to stderr (for debugging).
void dump() const;
+ /// Returns a range mapping the values in the range \p Operands to their
+ /// corresponding VPValues.
+ iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+ mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ return getOrAddVPValue(Op);
+ };
+ return map_range(Operands, Fn);
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -1480,7 +1687,10 @@ private:
unsigned BID = 0;
SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
- VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {}
+ VPSlotTracker SlotTracker;
+
+ VPlanPrinter(raw_ostream &O, const VPlan &P)
+ : OS(O), Plan(P), SlotTracker(&P) {}
/// Handle indentation.
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
@@ -1635,7 +1845,6 @@ public:
};
class VPInterleavedAccessInfo {
-private:
DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
InterleaveGroupMap;
@@ -1679,7 +1888,6 @@ public:
/// Class that maps (parts of) an existing VPlan to trees of combined
/// VPInstructions.
class VPlanSlp {
-private:
enum class OpMode { Failed, Load, Opcode };
/// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 19f5d2c00c604..a42ebc9ee955f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -30,7 +30,8 @@ using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
/// Template specializations of GraphTraits for VPDomTreeNode.
template <>
struct GraphTraits<VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<VPDomTreeNode, VPDomTreeNode::iterator> {};
+ : public DomTreeGraphTraitsBase<VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
template <>
struct GraphTraits<const VPDomTreeNode *>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3f6a2efd55ccb..3a4872a721221 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList *Inductions,
+ LoopVectorizationLegality::InductionList &Inductions,
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
@@ -41,7 +41,6 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
continue;
VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- VPRecipeBase *LastRecipe = nullptr;
// Introduce each ingredient into VPlan.
for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
VPRecipeBase *Ingredient = &*I++;
@@ -55,33 +54,29 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPRecipeBase *NewRecipe = nullptr;
// Create VPWidenMemoryInstructionRecipe for loads and stores.
- if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
nullptr /*Mask*/);
+ else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
- InductionDescriptor II = Inductions->lookup(Phi);
+ InductionDescriptor II = Inductions.lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
} else
NewRecipe = new VPWidenPHIRecipe(Phi);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
- NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop);
- } else {
- // If the last recipe is a VPWidenRecipe, add Inst to it instead of
- // creating a new recipe.
- if (VPWidenRecipe *WidenRecipe =
- dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
- WidenRecipe->appendInstruction(Inst);
- Ingredient->eraseFromParent();
- continue;
- }
- NewRecipe = new VPWidenRecipe(Inst);
- }
+ NewRecipe = new VPWidenGEPRecipe(
+ GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+ } else
+ NewRecipe =
+ new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
NewRecipe->insertBefore(Ingredient);
- LastRecipe = NewRecipe;
Ingredient->eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0d3bd7da09a70..4b20e8b4e3b31 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -19,14 +19,12 @@
namespace llvm {
-class VPlanTransforms {
-
-public:
+struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes.
static void VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList *Inductions,
+ LoopVectorizationLegality::InductionList &Inductions,
SmallPtrSetImpl<Instruction *> &DeadInstructions);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 464498c29d89e..f73505d0279af 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -22,13 +22,14 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/iterator_range.h"
namespace llvm {
// Forward declarations.
+class raw_ostream;
+class Value;
+class VPSlotTracker;
class VPUser;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -37,11 +38,11 @@ class VPUser;
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
friend class VPBuilder;
- friend class VPlanTransforms;
+ friend struct VPlanTransforms;
friend class VPBasicBlock;
friend class VPInterleavedAccessInfo;
+ friend class VPSlotTracker;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
SmallVector<VPUser *, 1> Users;
@@ -62,6 +63,7 @@ protected:
/// Return the underlying Value attached to this VPValue.
Value *getUnderlyingValue() { return UnderlyingVal; }
+ const Value *getUnderlyingValue() const { return UnderlyingVal; }
// Set \p Val as the underlying Value of this VPValue.
void setUnderlyingValue(Value *Val) {
@@ -85,9 +87,8 @@ public:
/// for any other purpose, as the values may change as LLVM evolves.
unsigned getVPValueID() const { return SubclassID; }
- void printAsOperand(raw_ostream &OS) const {
- OS << "%vp" << (unsigned short)(unsigned long long)this;
- }
+ void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
+ void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
unsigned getNumUsers() const { return Users.size(); }
void addUser(VPUser &User) { Users.push_back(&User); }
@@ -129,7 +130,6 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
/// This class augments VPValue with operands which provide the inverse def-use
/// edges from VPValue's users to their defs.
class VPUser : public VPValue {
-private:
SmallVector<VPValue *, 2> Operands;
protected:
@@ -144,6 +144,12 @@ public:
VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
VPUser(std::initializer_list<VPValue *> Operands)
: VPUser(ArrayRef<VPValue *>(Operands)) {}
+ template <typename IterT>
+ VPUser(iterator_range<IterT> Operands) : VPValue(VPValue::VPUserSC) {
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
VPUser(const VPUser &) = delete;
VPUser &operator=(const VPUser &) = delete;
@@ -180,6 +186,37 @@ public:
return const_operand_range(op_begin(), op_end());
}
};
+class VPlan;
+class VPBasicBlock;
+class VPRegionBlock;
+
+/// This class can be used to assign consecutive numbers to all VPValues in a
+/// VPlan and allows querying the numbering for printing, similar to the
+/// ModuleSlotTracker for IR values.
+class VPSlotTracker {
+ DenseMap<const VPValue *, unsigned> Slots;
+ unsigned NextSlot = 0;
+
+ void assignSlots(const VPBlockBase *VPBB);
+ void assignSlots(const VPRegionBlock *Region);
+ void assignSlots(const VPBasicBlock *VPBB);
+ void assignSlot(const VPValue *V);
+
+ void assignSlots(const VPlan &Plan);
+
+public:
+ VPSlotTracker(const VPlan *Plan) {
+ if (Plan)
+ assignSlots(*Plan);
+ }
+
+ unsigned getSlot(const VPValue *V) const {
+ auto I = Slots.find(V);
+ if (I == Slots.end())
+ return -1;
+ return I->second;
+ }
+};
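A minimal usage sketch follows (a hypothetical helper, mirroring how VPlanPrinter constructs its tracker in VPlan.h): build one tracker over the whole plan, then pass it to the printing hooks so each VPValue is rendered with its assigned slot.

  // Hypothetical helper: print a VPValue using plan-wide slot numbering.
  static void printWithSlots(const VPlan &Plan, const VPValue &V,
                             raw_ostream &OS) {
    VPSlotTracker SlotTracker(&Plan); // Assigns slots to all VPValues in Plan.
    V.printAsOperand(OS, SlotTracker); // Emits V using its assigned slot.
  }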
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index ab3e7e2282e77..b384c94121e9b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "VPlanVerifier.h"
+#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
index 7d2b262521723..8e8de441648ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -24,14 +24,12 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-#include "VPlan.h"
-
namespace llvm {
+class VPRegionBlock;
-/// Class with utility functions that can be used to check the consistency and
+/// Struct with utility functions that can be used to check the consistency and
/// invariants of a VPlan, including the components of its H-CFG.
-class VPlanVerifier {
-public:
+struct VPlanVerifier {
/// Verify the invariants of the H-CFG starting from \p TopRegion. The
/// verification process comprises the following steps:
/// 1. Region/Block verification: Check the Region/Block verification
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
new file mode 100644
index 0000000000000..64b41bf9cefa8
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -0,0 +1,699 @@
+//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes scalar/vector interactions using target cost models. The
+// transforms implemented here may not fit in traditional loop-based or SLP
+// vectorization passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
+STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
+STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+
+static cl::opt<bool> DisableVectorCombine(
+ "disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
+
+static cl::opt<bool> DisableBinopExtractShuffle(
+ "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
+ cl::desc("Disable binop extract to shuffle transforms"));
+
+static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
+
+namespace {
+class VectorCombine {
+public:
+ VectorCombine(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+
+ bool run();
+
+private:
+ Function &F;
+ IRBuilder<> Builder;
+ const TargetTransformInfo &TTI;
+ const DominatorTree &DT;
+
+ ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex) const;
+ bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex);
+ void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ bool foldExtractExtract(Instruction &I);
+ bool foldBitcastShuf(Instruction &I);
+ bool scalarizeBinopOrCmp(Instruction &I);
+ bool foldExtractedCmps(Instruction &I);
+};
+} // namespace
+
+static void replaceValue(Value &Old, Value &New) {
+ Old.replaceAllUsesWith(&New);
+ New.takeName(&Old);
+}
+
+/// Determine which, if any, of the inputs should be replaced by a shuffle
+/// followed by extract from a different index.
+ExtractElementInst *VectorCombine::getShuffleExtract(
+ ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex = InvalidIndex) const {
+ assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
+ isa<ConstantInt>(Ext1->getIndexOperand()) &&
+ "Expected constant extract indexes");
+
+ unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
+ unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+
+ // If the extract indexes are identical, no shuffle is needed.
+ if (Index0 == Index1)
+ return nullptr;
+
+ Type *VecTy = Ext0->getVectorOperand()->getType();
+ assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
+ int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+
+ // We are extracting from 2 different indexes, so one operand must be shuffled
+ // before performing a vector operation and/or extract. The more expensive
+ // extract will be replaced by a shuffle.
+ if (Cost0 > Cost1)
+ return Ext0;
+ if (Cost1 > Cost0)
+ return Ext1;
+
+ // If the costs are equal and there is a preferred extract index, shuffle the
+ // opposite operand.
+ if (PreferredExtractIndex == Index0)
+ return Ext1;
+ if (PreferredExtractIndex == Index1)
+ return Ext0;
+
+ // Otherwise, replace the extract with the higher index.
+ return Index0 > Index1 ? Ext0 : Ext1;
+}
+
+/// Compare the relative costs of 2 extracts followed by scalar operation vs.
+/// vector operation(s) followed by extract. Return true if the existing
+/// instructions are cheaper than a vector alternative. Otherwise, return false
+/// and if one of the extracts should be transformed to a shufflevector, set
+/// \p ConvertToShuffle to that extract instruction.
+bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex) {
+ assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
+ isa<ConstantInt>(Ext1->getOperand(1)) &&
+ "Expected constant extract indexes");
+ Type *ScalarTy = Ext0->getType();
+ auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
+ int ScalarOpCost, VectorOpCost;
+
+ // Get cost estimates for scalar and vector versions of the operation.
+ bool IsBinOp = Instruction::isBinaryOp(Opcode);
+ if (IsBinOp) {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ } else {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Expected a compare");
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
+ CmpInst::makeCmpResultType(VecTy));
+ }
+
+ // Get cost estimates for the extract elements. These costs will factor into
+ // both sequences.
+ unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
+ unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+
+ int Extract0Cost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ int Extract1Cost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+
+ // A more expensive extract will always be replaced by a splat shuffle.
+ // For example, if Ext0 is more expensive:
+ // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
+ // extelt (opcode (splat V0, Ext0), V1), Ext1
+ // TODO: Evaluate whether that always results in lowest cost. Alternatively,
+ // check the cost of creating a broadcast shuffle and shuffling both
+ // operands to element 0.
+ int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
+
+ // Extra uses of the extracts mean that we include those costs in the
+ // vector total because those instructions will not be eliminated.
+ int OldCost, NewCost;
+ if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+ // Handle a special case. If the 2 extracts are identical, adjust the
+ // formulas to account for that. The extra use charge allows for either the
+ // CSE'd pattern or an unoptimized form with identical values:
+ // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
+ bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+ : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+ OldCost = CheapExtractCost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
+ } else {
+ // Handle the general case. Each extract is actually a different value:
+ // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
+ OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost +
+ !Ext0->hasOneUse() * Extract0Cost +
+ !Ext1->hasOneUse() * Extract1Cost;
+ }
+
+ ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
+ if (ConvertToShuffle) {
+ if (IsBinOp && DisableBinopExtractShuffle)
+ return true;
+
+ // If we are extracting from 2 different indexes, then one operand must be
+ // shuffled before performing the vector operation. The shuffle mask is
+ // undefined except for 1 lane that is being translated to the remaining
+ // extraction lane. Therefore, it is a splat shuffle. Ex:
+ // ShufMask = { undef, undef, 0, undef }
+ // TODO: The cost model has an option for a "broadcast" shuffle
+ // (splat-from-element-0), but no option for a more general splat.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+
+ // Aggressively form a vector op if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ return OldCost < NewCost;
+}
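To make the comparison concrete with illustrative (assumed) unit costs: if both extracts are single-use, read different lanes, and the scalar op, the vector op, each extract, and the shuffle all cost 1, then OldCost = 1 + 1 + 1 = 3 and NewCost = 1 (vector op) + 1 (cheap extract) + 1 (shuffle) = 3, so the function returns false and the vector form is chosen, in line with transforming aggressively on cost ties.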
+
+/// Create a shuffle that translates (shifts) 1 element from the input vector
+/// to a new element location.
+static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
+ unsigned NewIndex, IRBuilder<> &Builder) {
+ // The shuffle mask is undefined except for 1 lane that is being translated
+ // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
+ // ShufMask = { 2, undef, undef, undef }
+ auto *VecTy = cast<FixedVectorType>(Vec->getType());
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ ShufMask[NewIndex] = OldIndex;
+ Value *Undef = UndefValue::get(VecTy);
+ return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift");
+}
+
+/// Given an extract element instruction with constant index operand, shuffle
+/// the source vector (shift the scalar element) to a NewIndex for extraction.
+/// Return null if the input can be constant folded, so that we are not creating
+/// unnecessary instructions.
+static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
+ unsigned NewIndex,
+ IRBuilder<> &Builder) {
+ // If the extract can be constant-folded, this code is unsimplified. Defer
+ // to other passes to handle that.
+ Value *X = ExtElt->getVectorOperand();
+ Value *C = ExtElt->getIndexOperand();
+ assert(isa<ConstantInt>(C) && "Expected a constant index operand");
+ if (isa<Constant>(X))
+ return nullptr;
+
+ Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
+ NewIndex, Builder);
+ return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
+}
+
+/// Try to reduce extract element costs by converting scalar compares to vector
+/// compares followed by extract.
+/// cmp (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<CmpInst>(&I) && "Expected a compare");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
+ ++NumVecCmp;
+ CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
+ Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+/// bo (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
+ ++NumVecBO;
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecBO =
+ Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+
+ // All IR flags are safe to back-propagate because any potential poison
+ // created in unused vector elements is discarded by the extract.
+ if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
+ VecBOInst->copyIRFlags(&I);
+
+ Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Match an instruction with extracted vector operands.
+bool VectorCombine::foldExtractExtract(Instruction &I) {
+ // It is not safe to transform things like div, urem, etc. because we may
+ // create undefined behavior when executing those on unknown vector elements.
+ if (!isSafeToSpeculativelyExecute(&I))
+ return false;
+
+ Instruction *I0, *I1;
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
+ !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
+ return false;
+
+ Value *V0, *V1;
+ uint64_t C0, C1;
+ if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
+ !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
+ V0->getType() != V1->getType())
+ return false;
+
+ // If the scalar value 'I' is going to be re-inserted into a vector, then try
+ // to create an extract to that same element. The extract/insert can be
+ // reduced to a "select shuffle".
+ // TODO: If we add a larger pattern match that starts from an insert, this
+ // probably becomes unnecessary.
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ uint64_t InsertIndex = InvalidIndex;
+ if (I.hasOneUse())
+ match(I.user_back(),
+ m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
+
+ ExtractElementInst *ExtractToChange;
+ if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
+ InsertIndex))
+ return false;
+
+ if (ExtractToChange) {
+ unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
+ ExtractElementInst *NewExtract =
+ translateExtract(ExtractToChange, CheapExtractIdx, Builder);
+ if (!NewExtract)
+ return false;
+ if (ExtractToChange == Ext0)
+ Ext0 = NewExtract;
+ else
+ Ext1 = NewExtract;
+ }
+
+ if (Pred != CmpInst::BAD_ICMP_PREDICATE)
+ foldExtExtCmp(Ext0, Ext1, I);
+ else
+ foldExtExtBinop(Ext0, Ext1, I);
+
+ return true;
+}
+
+/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
+/// destination type followed by shuffle. This can enable further transforms by
+/// moving bitcasts or shuffles together.
+bool VectorCombine::foldBitcastShuf(Instruction &I) {
+ Value *V;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_BitCast(
+ m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
+ return false;
+
+ // Disallow non-vector casts and length-changing shuffles.
+ // TODO: We could allow any shuffle.
+ auto *DestTy = dyn_cast<VectorType>(I.getType());
+ auto *SrcTy = cast<VectorType>(V->getType());
+ if (!DestTy || I.getOperand(0)->getType() != SrcTy)
+ return false;
+
+ // The new shuffle must not cost more than the old shuffle. The bitcast is
+ // moved ahead of the shuffle, so assume that it has the same cost as before.
+ if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+ return false;
+
+ unsigned DestNumElts = DestTy->getNumElements();
+ unsigned SrcNumElts = SrcTy->getNumElements();
+ SmallVector<int, 16> NewMask;
+ if (SrcNumElts <= DestNumElts) {
+ // The bitcast is from wide to narrow/equal elements. The shuffle mask can
+ // always be expanded to the equivalent form choosing narrower elements.
+ assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = DestNumElts / SrcNumElts;
+ narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
+ } else {
+ // The bitcast is from narrow elements to wide elements. The shuffle mask
+ // must choose consecutive elements to allow casting first.
+ assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = SrcNumElts / DestNumElts;
+ if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
+ return false;
+ }
+ // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+ ++NumShufOfBitcast;
+ Value *CastV = Builder.CreateBitCast(V, DestTy);
+ Value *Shuf =
+ Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask);
+ replaceValue(I, *Shuf);
+ return true;
+}
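For example, with illustrative types: a shuffle of a <2 x i64> source with mask {1, 0} whose result is bitcast to <4 x i32> becomes a bitcast of the source to <4 x i32> followed by a shuffle with the narrowed mask {2, 3, 0, 1}, which performs the equivalent selection on the narrower elements.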
+
+/// Match a vector binop or compare instruction with at least one inserted
+/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ Value *Ins0, *Ins1;
+ if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
+ return false;
+
+ // Do not convert the vector condition of a vector select into a scalar
+ // condition. That may cause problems for codegen because of differences in
+ // boolean formats and register-file transfers.
+ // TODO: Can we account for that in the cost model?
+ bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
+ if (IsCmp)
+ for (User *U : I.users())
+ if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
+ return false;
+
+ // Match against one or both scalar values being inserted into constant
+ // vectors:
+ // vec_op VecC0, (inselt VecC1, V1, Index)
+ // vec_op (inselt VecC0, V0, Index), VecC1
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
+ // TODO: Deal with mismatched index constants and variable indexes?
+ Constant *VecC0 = nullptr, *VecC1 = nullptr;
+ Value *V0 = nullptr, *V1 = nullptr;
+ uint64_t Index0 = 0, Index1 = 0;
+ if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index0))) &&
+ !match(Ins0, m_Constant(VecC0)))
+ return false;
+ if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+ m_ConstantInt(Index1))) &&
+ !match(Ins1, m_Constant(VecC1)))
+ return false;
+
+ bool IsConst0 = !V0;
+ bool IsConst1 = !V1;
+ if (IsConst0 && IsConst1)
+ return false;
+ if (!IsConst0 && !IsConst1 && Index0 != Index1)
+ return false;
+
+ // Bail for single insertion if it is a load.
+ // TODO: Handle this once getVectorInstrCost can cost for load/stores.
+ auto *I0 = dyn_cast_or_null<Instruction>(V0);
+ auto *I1 = dyn_cast_or_null<Instruction>(V1);
+ if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
+ (IsConst1 && I0 && I0->mayReadFromMemory()))
+ return false;
+
+ uint64_t Index = IsConst0 ? Index1 : Index0;
+ Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
+ Type *VecTy = I.getType();
+ assert(VecTy->isVectorTy() &&
+ (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
+ (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
+ ScalarTy->isPointerTy()) &&
+ "Unexpected types for insert element into binop or cmp");
+
+ unsigned Opcode = I.getOpcode();
+ int ScalarOpCost, VectorOpCost;
+ if (IsCmp) {
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+ } else {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ }
+
+ // Get cost estimate for the insert element. This cost will factor into
+ // both sequences.
+ int InsertCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) +
+ VectorOpCost;
+ int NewCost = ScalarOpCost + InsertCost +
+ (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
+ (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+
+ // We want to scalarize unless the vector variant actually has lower cost.
+ if (OldCost < NewCost)
+ return false;
+
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+ // inselt NewVecC, (scalar_op V0, V1), Index
+ if (IsCmp)
+ ++NumScalarCmp;
+ else
+ ++NumScalarBO;
+
+ // For constant cases, extract the scalar element, this should constant fold.
+ if (IsConst0)
+ V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
+ if (IsConst1)
+ V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+
+ Value *Scalar =
+ IsCmp ? Builder.CreateCmp(Pred, V0, V1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+
+ Scalar->setName(I.getName() + ".scalar");
+
+ // All IR flags are safe to back-propagate. There is no potential for extra
+ // poison to be created by the scalar instruction.
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+ ScalarInst->copyIRFlags(&I);
+
+ // Fold the vector constants in the original vectors into a new base vector.
+ Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
+ : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+ replaceValue(I, *Insert);
+ return true;
+}
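As an illustrative instance of the pattern above: add (inselt <4 x i32> zeroinitializer, %x, 1), <1, 2, 3, 4> has a constant second operand, so lane 1 of that constant (the value 2) is extracted, the scalar add %x, 2 is formed, the two vector constants fold to NewVecC = <1, 2, 3, 4>, and the result is inselt <1, 2, 3, 4>, (add %x, 2), 1: lane 1 holds %x + 2 and the other lanes keep the folded constants.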
+
+/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
+/// a vector into vector operations followed by extract. Note: The SLP pass
+/// may miss this pattern because of implementation problems.
+bool VectorCombine::foldExtractedCmps(Instruction &I) {
+ // We are looking for a scalar binop of booleans.
+ // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
+ if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+ return false;
+
+ // The compare predicates should match, and each compare should have a
+ // constant operand.
+ // TODO: Relax the one-use constraints.
+ Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
+ Instruction *I0, *I1;
+ Constant *C0, *C1;
+ CmpInst::Predicate P0, P1;
+ if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
+ !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
+ P0 != P1)
+ return false;
+
+ // The compare operands must be extracts of the same vector with constant
+ // extract indexes.
+ // TODO: Relax the one-use constraints.
+ Value *X;
+ uint64_t Index0, Index1;
+ if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
+ !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+ return false;
+
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ if (!ConvertToShuf)
+ return false;
+
+ // The original scalar pattern is:
+ // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
+ CmpInst::Predicate Pred = P0;
+ unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
+ : Instruction::ICmp;
+ auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!VecTy)
+ return false;
+
+ int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+ OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+
+ // The proposed vector pattern is:
+ // vcmp = cmp Pred X, VecC
+ // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
+ int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
+ int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
+ auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+ int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+
+ // Aggressively form vector ops if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ if (OldCost < NewCost)
+ return false;
+
+ // Create a vector constant from the 2 scalar constants.
+ SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
+ UndefValue::get(VecTy->getElementType()));
+ CmpC[Index0] = C0;
+ CmpC[Index1] = C1;
+ Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
+
+ Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
+ Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+ VCmp, Shuf);
+ Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
+ replaceValue(I, *NewExt);
+ ++NumVecCmpBO;
+ return true;
+}
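As an illustrative instance: given and (icmp eq (extelt %x, 0), C0), (icmp eq (extelt %x, 3), C1) on a <4 x i32> source with equal extract costs, getShuffleExtract picks the lane-3 extract, so the rewrite compares %x against <C0, undef, undef, C1> with a single vector icmp, shifts lane 3 of that result into lane 0 with a splat-style shuffle, ands the two i1 vectors, and extracts lane 0.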
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+bool VectorCombine::run() {
+ if (DisableVectorCombine)
+ return false;
+
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block forwards to enable simple iterative chains of transforms.
+ // TODO: It could be more efficient to remove dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ Builder.SetInsertPoint(&I);
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldBitcastShuf(I);
+ MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+class VectorCombineLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ VectorCombineLegacyPass() : FunctionPass(ID) {
+ initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ VectorCombine Combiner(F, TTI, DT);
+ return Combiner.run();
+ }
+};
+} // namespace
+
+char VectorCombineLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
+                      "Optimize scalar/vector ops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false, false)
+Pass *llvm::createVectorCombinePass() {
+ return new VectorCombineLegacyPass();
+}
+
+PreservedAnalyses VectorCombinePass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ VectorCombine Combiner(F, TTI, DT);
+ if (!Combiner.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ return PA;
+}
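For completeness, a minimal sketch of scheduling the new-PM pass directly from C++ (assumed setup, not part of this patch); the function analysis manager must already provide the analyses requested in run() above.

  // Hypothetical driver: add VectorCombine to a new-PM function pipeline.
  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Vectorize/VectorCombine.h"

  void addVectorCombine(llvm::FunctionPassManager &FPM) {
    FPM.addPass(llvm::VectorCombinePass());
  }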
diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 6a4f9169c2af0..0296a995ad29a 100644
--- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -21,12 +21,12 @@
using namespace llvm;
-/// initializeVectorizationPasses - Initialize all passes linked into the
-/// Vectorization library.
+/// Initialize all passes linked into the Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerLegacyPassPass(Registry);
+ initializeVectorCombineLegacyPassPass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {