author    Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
commit    706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree      4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent    7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1011
1 file changed, 548 insertions(+), 463 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f0bf70f873c..684a3098e564 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,8 +58,8 @@
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
-#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
+#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -124,6 +124,7 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -149,7 +150,6 @@
#include <string>
#include <tuple>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-/// We don't interleave loops with a known constant trip count below this
-/// number.
-static const unsigned TinyTripCountInterleaveThreshold = 128;
+static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
+ "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
+ cl::desc("We don't interleave loops with a estimated constant trip count "
+ "below this number"));
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
@@ -427,6 +428,11 @@ public:
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;
+ /// Vectorize a single GetElementPtrInst based on information gathered and
+ /// decisions taken during planning.
+ void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
+ bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -476,15 +482,20 @@ public:
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
- /// Try to vectorize the interleaved access group that \p Instr belongs to,
- /// optionally masking the vector operations if \p BlockInMask is non-null.
- void vectorizeInterleaveGroup(Instruction *Instr,
- VectorParts *BlockInMask = nullptr);
-
- /// Vectorize Load and Store instructions, optionally masking the vector
- /// operations if \p BlockInMask is non-null.
- void vectorizeMemoryInstruction(Instruction *Instr,
- VectorParts *BlockInMask = nullptr);
+ /// Try to vectorize the interleaved access group that \p Instr belongs to
+ /// with the base address given in \p Addr, optionally masking the vector
+ /// operations if \p BlockInMask is non-null. Use \p State to translate given
+ /// VPValues to IR values in the vectorized loop.
+ void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
+ VPValue *Addr, VPValue *BlockInMask = nullptr);
+
+ /// Vectorize Load and Store instructions with the base address given in \p
+ /// Addr, optionally masking the vector operations if \p BlockInMask is
+ /// non-null. Use \p State to translate given VPValues to IR values in the
+ /// vectorized loop.
+ void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+ VPValue *Addr,
+ VPValue *BlockInMask = nullptr);
/// Set the debug location in the builder using the debug location in
/// the instruction.
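These widened signatures are driven from VPlan recipes rather than called with raw IR pointers. As a minimal sketch of the intended call site (the recipe class, its stored ingredient, and the getAddr()/getMask() accessors live in VPlan.h and are assumed here, not taken from this diff):

  void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
    // The address and mask are VPValues; the InnerLoopVectorizer (reachable
    // through State.ILV) resolves them to per-part IR values via State.get()
    // while emitting the wide load/store.
    State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
  }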
@@ -525,6 +536,9 @@ protected:
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);
+ /// Clear NSW/NUW flags from reduction instructions if necessary.
+ void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
/// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
@@ -539,10 +553,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths();
- /// Insert the new loop to the loop hierarchy and pass manager
- /// and update the analysis passes.
- void updateAnalysis();
-
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -1204,14 +1214,14 @@ public:
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType) {
- return TTI.isLegalMaskedScatter(DataType);
+ bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+ return TTI.isLegalMaskedScatter(DataType, Alignment);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType) {
- return TTI.isLegalMaskedGather(DataType);
+ bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+ return TTI.isLegalMaskedGather(DataType, Alignment);
}
/// Returns true if the target machine can represent \p V as a masked gather
@@ -1222,7 +1232,9 @@ public:
if (!LI && !SI)
return false;
auto *Ty = getMemInstValueType(V);
- return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ MaybeAlign Align = getLoadStoreAlignment(V);
+ return (LI && isLegalMaskedGather(Ty, Align)) ||
+ (SI && isLegalMaskedScatter(Ty, Align));
}
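The MaybeAlign passed to these hooks may be empty when the IR access carries no explicit alignment. A minimal sketch of how such a value is resolved to a concrete alignment, mirroring the pattern used further down in this file (I is assumed to be a load or store and DL its module's DataLayout):

  const MaybeAlign MA = getLoadStoreAlignment(I);
  Type *ScalarTy = getMemInstValueType(I);
  // An empty (None) alignment means "use the target ABI alignment of the
  // accessed type"; DataLayout supplies that fallback, and the resulting
  // Align is what gets fed to the cost model and the masked intrinsics.
  const Align A = DL.getValueOrABITypeAlignment(MA, ScalarTy);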
/// Returns true if \p I is an instruction that will be scalarized with
@@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
- VectorParts *BlockInMask) {
+ VPTransformState &State,
+ VPValue *Addr,
+ VPValue *BlockInMask) {
const InterleaveGroup<Instruction> *Group =
Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
@@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
return;
const DataLayout &DL = Instr->getModule()->getDataLayout();
- Value *Ptr = getLoadStorePointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
- Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
// Prepare for the new pointers.
- setDebugLocFromInst(Builder, Ptr);
- SmallVector<Value *, 2> NewPtrs;
+ SmallVector<Value *, 2> AddrParts;
unsigned Index = Group->getIndex(Instr);
- VectorParts Mask;
- bool IsMaskForCondRequired = BlockInMask;
- if (IsMaskForCondRequired) {
- Mask = *BlockInMask;
- // TODO: extend the masked interleaved-group support to reversed access.
- assert(!Group->isReverse() && "Reversed masked interleave-group "
- "not supported.");
- }
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert((!BlockInMask || !Group->isReverse()) &&
+ "Reversed masked interleave-group not supported.");
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
@@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
if (Group->isReverse())
Index += (VF - 1) * Group->getFactor();
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
-
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
+ Value *AddrPart = State.get(Addr, {Part, 0});
+ setDebugLocFromInst(Builder, AddrPart);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
// A[i] = b; // Member of index 0
// A[i+2] = c; // Member of index 2 (Current instruction)
// Current pointer is pointed to A[i+2], adjust it to A[i].
- NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
- if (InBounds)
- cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
+ cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
// Cast to the vector pointer type.
- NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
+ unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
+ Type *PtrTy = VecTy->getPointerTo(AddressSpace);
+ AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
}
setDebugLocFromInst(Builder, Instr);
@@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
Instruction *NewLoad;
- if (IsMaskForCondRequired || MaskForGaps) {
+ if (BlockInMask || MaskForGaps) {
assert(useMaskedInterleavedAccesses(*TTI) &&
"masked interleaved groups are not allowed.");
Value *GroupMask = MaskForGaps;
- if (IsMaskForCondRequired) {
- auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- Mask[Part], Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
: ShuffledMask;
}
NewLoad =
- Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+ Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
GroupMask, UndefVec, "wide.masked.vec");
}
else
- NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+ NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
Group->getAlignment(), "wide.vec");
Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
@@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
"interleaved.vec");
Instruction *NewStoreInstr;
- if (IsMaskForCondRequired) {
- auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- Mask[Part], Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
NewStoreInstr = Builder.CreateMaskedStore(
- IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+ IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
}
else
- NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
- Group->getAlignment());
+ NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
+ Group->getAlignment());
Group->addMetadata(NewStoreInstr);
}
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
- VectorParts *BlockInMask) {
+ VPTransformState &State,
+ VPValue *Addr,
+ VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
@@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
"CM decision should be taken at this point");
if (Decision == LoopVectorizationCostModel::CM_Interleave)
- return vectorizeInterleaveGroup(Instr);
+ return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
- Value *Ptr = getLoadStorePointerOperand(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
const Align Alignment =
DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
- unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
// gather/scatter. Otherwise Decision should have been to Scalarize.
assert((ConsecutiveStride || CreateGatherScatter) &&
"The instruction should be scalarized");
+ (void)ConsecutiveStride;
- // Handle consecutive loads/stores.
- if (ConsecutiveStride)
- Ptr = getOrCreateScalarValue(Ptr, {0, 0});
-
- VectorParts Mask;
+ VectorParts BlockInMaskParts(UF);
bool isMaskRequired = BlockInMask;
if (isMaskRequired)
- Mask = *BlockInMask;
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(
- getLoadStorePointerOperand(Instr)->stripPointerCasts()))
- InBounds = gep->isInBounds();
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockInMaskParts[Part] = State.get(BlockInMask, Part);
const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
// Calculate the pointer for the specific unroll-part.
GetElementPtrInst *PartPtr = nullptr;
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
if (Reverse) {
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
@@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- Mask[Part] = reverseVector(Mask[Part]);
+ BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
} else {
PartPtr = cast<GetElementPtrInst>(
Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
PartPtr->setIsInBounds(InBounds);
}
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
};
@@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Instruction *NewSI = nullptr;
Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
- Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
Alignment.value(), MaskPart);
} else {
@@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
// We don't want to update the value in the map as it might be used in
// another expression. So don't call resetVectorValue(StoredVal).
}
- auto *VecPtr = CreateVecPtr(Part, Ptr);
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
- Alignment.value(), Mask[Part]);
+ NewSI = Builder.CreateMaskedStore(
+ StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
else
NewSI =
Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
@@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
for (unsigned Part = 0; Part < UF; ++Part) {
Value *NewLI;
if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
- Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
- auto *VecPtr = CreateVecPtr(Part, Ptr);
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
- UndefValue::get(DataTy),
- "wide.masked.load");
+ NewLI = Builder.CreateMaskedLoad(
+ VecPtr, Alignment.value(), BlockInMaskParts[Part],
+ UndefValue::get(DataTy), "wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
"wide.load");
@@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
- BasicBlock *BB = L->getLoopPreheader();
- IRBuilder<> Builder(BB->getTerminator());
+ // Reuse existing vector loop preheader for TC checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ IRBuilder<> Builder(TCCheckBlock->getTerminator());
// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
@@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
P, Count, ConstantInt::get(Count->getType(), VF * UF),
"min.iters.check");
- BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
- // Update dominator tree immediately if the generated block is a
- // LoopBypassBlock because SCEV expansions to generate loop bypass
- // checks may query it before the current function is finished.
- DT->addNewBlock(NewBB, BB);
- if (L->getParentLoop())
- L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
- ReplaceInstWithInst(BB->getTerminator(),
- BranchInst::Create(Bypass, NewBB, CheckMinIters));
- LoopBypassBlocks.push_back(BB);
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
+ assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+ DT->getNode(Bypass)->getIDom()) &&
+ "TC check is expected to dominate Bypass");
+
+ // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+ ReplaceInstWithInst(
+ TCCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ LoopBypassBlocks.push_back(TCCheckBlock);
}
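The rewritten check emission relies on SplitBlock keeping the dominator tree and loop info consistent, instead of the manual DT->addNewBlock / addBasicBlockToLoop bookkeeping the removed lines performed. A condensed sketch of the pattern (Check and Cond are placeholder names for the reused check block and its guard condition):

  // SplitBlock moves Check's terminator into a fresh "vector.ph" successor
  // and updates DT and LI for the new block.
  BasicBlock *NewPreHeader =
      SplitBlock(Check, Check->getTerminator(), DT, LI, nullptr, "vector.ph");
  // Re-terminate Check: bypass the vector loop when the check fails,
  // otherwise fall through to the new preheader.
  ReplaceInstWithInst(Check->getTerminator(),
                      BranchInst::Create(Bypass, NewPreHeader, Cond));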
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
- BasicBlock *BB = L->getLoopPreheader();
+ // Reuse existing vector loop preheader for SCEV checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
// Generate the code to check that the SCEV assumptions that we made.
// We want the new basic block to start at the first instruction in a
// sequence of instructions that form a check.
SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
"scev.check");
- Value *SCEVCheck =
- Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
+ Value *SCEVCheck = Exp.expandCodeForPredicate(
+ &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
if (C->isZero())
return;
- assert(!BB->getParent()->hasOptSize() &&
+ assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
"Cannot SCEV check stride or overflow when optimizing for size");
- // Create a new block containing the stride check.
- BB->setName("vector.scevcheck");
- auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
- // Update dominator tree immediately if the generated block is a
- // LoopBypassBlock because SCEV expansions to generate loop bypass
- // checks may query it before the current function is finished.
- DT->addNewBlock(NewBB, BB);
- if (L->getParentLoop())
- L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
- ReplaceInstWithInst(BB->getTerminator(),
- BranchInst::Create(Bypass, NewBB, SCEVCheck));
- LoopBypassBlocks.push_back(BB);
+ SCEVCheckBlock->setName("vector.scevcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
+ nullptr, "vector.ph");
+
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+ }
+
+ ReplaceInstWithInst(
+ SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+ LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
}
@@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
if (EnableVPlanNativePath)
return;
- BasicBlock *BB = L->getLoopPreheader();
+ // Reuse existing vector loop preheader for runtime memory checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const MemCheckBlock = L->getLoopPreheader();
// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
@@ -2752,11 +2777,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
+ Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
if (!MemRuntimeCheck)
return;
- if (BB->getParent()->hasOptSize()) {
+ if (MemCheckBlock->getParent()->hasOptSize()) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
"Cannot emit memory checks when optimizing for size, unless forced "
"to vectorize.");
@@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
});
}
- // Create a new block containing the memory check.
- BB->setName("vector.memcheck");
- auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
- // Update dominator tree immediately if the generated block is a
- // LoopBypassBlock because SCEV expansions to generate loop bypass
- // checks may query it before the current function is finished.
- DT->addNewBlock(NewBB, BB);
- if (L->getParentLoop())
- L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
- ReplaceInstWithInst(BB->getTerminator(),
- BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
- LoopBypassBlocks.push_back(BB);
+ MemCheckBlock->setName("vector.memcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, MemCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
+ }
+
+ ReplaceInstWithInst(
+ MemCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
+ LoopBypassBlocks.push_back(MemCheckBlock);
AddedSafetyChecks = true;
// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
- PSE.getSE());
+ PSE.getSE());
LVer->prepareNoAliasMetadata();
}
@@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
...
*/
- BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
- BasicBlock *ExitBlock = OrigLoop->getExitBlock();
MDNode *OrigLoopID = OrigLoop->getLoopID();
- assert(VectorPH && "Invalid loop structure");
- assert(ExitBlock && "Must have an exit block");
// Some loops have a single integer induction variable, while other loops
// don't. One example is c++ iterators that often have multiple pointer
@@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
Type *IdxTy = Legal->getWidestInductionType();
// Split the single block loop into the two loop structure described above.
- BasicBlock *VecBody =
- VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
- BasicBlock *MiddleBlock =
- VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
- BasicBlock *ScalarPH =
- MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+ LoopScalarBody = OrigLoop->getHeader();
+ LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+ LoopExitBlock = OrigLoop->getExitBlock();
+ assert(LoopExitBlock && "Must have an exit block");
+ assert(LoopVectorPreHeader && "Invalid loop structure");
+
+ LoopMiddleBlock =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ LI, nullptr, "middle.block");
+ LoopScalarPreHeader =
+ SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+ nullptr, "scalar.ph");
+ // We intentionally don't let SplitBlock update LoopInfo since
+ // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
+ // LoopVectorBody is explicitly added to the correct place a few lines later.
+ LoopVectorBody =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ nullptr, nullptr, "vector.body");
+
+ // Update dominator for loop exit.
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
// Create and register the new vector loop.
Loop *Lp = LI->AllocateLoop();
@@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// before calling any utilities such as SCEV that require valid LoopInfo.
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
- ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
- ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
LI->addTopLevelLoop(Lp);
}
- Lp->addBasicBlockToLoop(VecBody, *LI);
+ Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
// Find the loop boundaries.
Value *Count = getOrCreateTripCount(Lp);
@@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// backedge-taken count is uint##_max: adding one to it will overflow leading
// to an incorrect trip count of zero. In this (rare) case we will also jump
// to the scalar loop.
- emitMinimumIterationCountCheck(Lp, ScalarPH);
+ emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
- emitSCEVChecks(Lp, ScalarPH);
+ emitSCEVChecks(Lp, LoopScalarPreHeader);
// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
- emitMemRuntimeChecks(Lp, ScalarPH);
+ emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
// Generate the induction variable.
// The loop step is equal to the vectorization factor (num of SIMD elements)
@@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
InductionDescriptor II = InductionEntry.second;
// Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal = PHINode::Create(
- OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+ PHINode *BCResumeVal =
+ PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
// Copy original phi DL over to the new one.
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
Value *&EndValue = IVEndValues[OrigPhi];
@@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
- CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+ CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
- const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValue->setName("ind.end");
}
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, MiddleBlock);
+ BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
// Fix the scalar body counter (PHI node).
// The old induction's phi node in the scalar body needs the truncated
// value.
for (BasicBlock *BB : LoopBypassBlocks)
BCResumeVal->addIncoming(II.getStartValue(), BB);
- OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
+ OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
// We need the OrigLoop (scalar loop part) latch terminator to help
@@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// If tail is to be folded, we know we don't need to run the remainder.
Value *CmpN = Builder.getTrue();
if (!Cost->foldTailByMasking()) {
- CmpN =
- CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+ CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+ CountRoundDown, "cmp.n",
+ LoopMiddleBlock->getTerminator());
// Here we use the same DebugLoc as the scalar loop latch branch instead
// of the corresponding compare because they may have ended up with
@@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
}
- BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+ BranchInst *BrInst =
+ BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
- ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
+ ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
// Get ready to start creating new instructions into the vectorized body.
- Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
-
- // Save the state.
- LoopVectorPreHeader = Lp->getLoopPreheader();
- LoopScalarPreHeader = ScalarPH;
- LoopMiddleBlock = MiddleBlock;
- LoopExitBlock = ExitBlock;
- LoopVectorBody = VecBody;
- LoopScalarBody = OldBasicBlock;
+ assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+ "Inconsistent vector loop preheader");
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
Optional<MDNode *> VectorizedLoopID =
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
@@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ LI->verify(*DT);
+#endif
+
return LoopVectorPreHeader;
}
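Taken together, the splits above produce a skeleton roughly like the following (a sketch; the check blocks only exist when the corresponding runtime checks are emitted):

  //   [old preheader / min.iters.check] --(too few iterations)-----> scalar.ph
  //     |
  //   [vector.scevcheck] --(SCEV predicate fails)-----------------> scalar.ph
  //     |
  //   [vector.memcheck] --(runtime aliasing detected)-------------> scalar.ph
  //     |
  //   vector.ph
  //     |
  //   vector.body <---(backedge while vector iterations remain)
  //     |
  //   middle.block --(scalar remainder needed)--------------------> scalar.ph
  //     |                                                               |
  //   exit block <------------------------------ original scalar loop <-+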
@@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// This is the second stage of vectorizing recurrences.
fixCrossIterationPHIs();
- // Update the dominator tree.
- //
- // FIXME: After creating the structure of the new loop, the dominator tree is
- // no longer up-to-date, and it remains that way until we update it
- // here. An out-of-date dominator tree is problematic for SCEV,
- // because SCEVExpander uses it to guide code generation. The
- // vectorizer use SCEVExpanders in several places. Instead, we should
- // keep the dominator tree up-to-date as we go.
- updateAnalysis();
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
// Fix-up external users of the induction variables.
for (auto &Entry : *Legal->getInductionVars())
@@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// among all unrolled iterations, due to the order of their construction.
Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
- // Set the insertion point after the previous value if it is an instruction.
+ // Find and set the insertion point after the previous value if it is an
+ // instruction.
+ BasicBlock::iterator InsertPt;
// Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop. Also, if the previous
- // value is a phi node, we should insert after all the phi nodes to avoid
- // breaking basic block verification.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
- isa<PHINode>(PreviousLastPart))
- Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
- else
- Builder.SetInsertPoint(
- &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
+ // guaranteed to be an instruction in the vector loop.
+ // FIXME: Loop invariant values do not form recurrences. We should deal with
+ // them earlier.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
+ InsertPt = LoopVectorBody->getFirstInsertionPt();
+ else {
+ Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
+ if (isa<PHINode>(PreviousLastPart))
+ // If the previous value is a phi node, we should insert after all the phi
+ // nodes in the block containing the PHI to avoid breaking basic block
+ // verification. Note that the basic block may be different to
+ // LoopVectorBody, in case we predicate the loop.
+ InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
+ else
+ InsertPt = ++PreviousInst->getIterator();
+ }
+ Builder.SetInsertPoint(&*InsertPt);
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
@@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
}
+ // Wrap flags are in general invalid after vectorization, clear them.
+ clearReductionWrapFlags(RdxDesc);
+
// Fix the vector-loop phi.
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
Value *Val = getOrCreateVectorValue(LoopVal, Part);
- // Make sure to add the reduction stat value only to the
+ // Make sure to add the reduction start value only to the
// first unroll part.
Value *StartVal = (Part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
@@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
+void InnerLoopVectorizer::clearReductionWrapFlags(
+ RecurrenceDescriptor &RdxDesc) {
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
+ RK != RecurrenceDescriptor::RK_IntegerMult)
+ return;
+
+ Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+ assert(LoopExitInstr && "null loop exit instruction");
+ SmallVector<Instruction *, 8> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(LoopExitInstr);
+ Visited.insert(LoopExitInstr);
+
+ while (!Worklist.empty()) {
+ Instruction *Cur = Worklist.pop_back_val();
+ if (isa<OverflowingBinaryOperator>(Cur))
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = getOrCreateVectorValue(Cur, Part);
+ cast<Instruction>(V)->dropPoisonGeneratingFlags();
+ }
+
+ for (User *U : Cur->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+ Visited.insert(UI).second)
+ Worklist.push_back(UI);
+ }
+ }
+}
+
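The flags become invalid because widening reassociates the reduction into per-lane partial sums. A hand-written illustration (not from an LLVM test) for an integer add reduction at VF = 4:

  //   scalar loop:  %sum.next = add nsw i32 %sum, %x
  //   vector loop:  %vec.sum.next = add <4 x i32> %vec.sum, %wide.x  ; flags dropped
  // Each lane now accumulates only every fourth element, a different
  // association than the scalar running sum, so a lane may overflow even when
  // every intermediate value of the original reduction stays in range; keeping
  // nsw/nuw on the widened add could therefore introduce poison.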
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getNumIncomingValues() == 1) {
@@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
}
}
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
+ unsigned VF, bool IsPtrLoopInvariant,
+ SmallBitVector &IsIndexLoopInvariant) {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
+ if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant
+ ? GEP->getPointerOperand()
+ : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto Index : enumerate(GEP->indices())) {
+ Value *User = Index.value().get();
+ if (IsIndexLoopInvariant[Index.index()])
+ Indices.push_back(User);
+ else
+ Indices.push_back(getOrCreateVectorValue(User, Part));
+ }
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP =
+ GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+ Indices)
+ : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
+ addMetadata(NewGEP, GEP);
+ }
+ }
+}
+
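The invariance flags consumed by widenGEP are expected to be captured once, when the corresponding VPWidenGEPRecipe is built against the original loop (the recipe itself lives in VPlan.h; the constructor below is a sketch of that idea, not code from this diff, with the base-class initialization elided):

  VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop)
      : GEP(GEP), IsIndexLoopInvariant(GEP->getNumIndices(), false) {
    // Record invariance against the original loop so execute() can call
    // widenGEP() later without re-querying loop information.
    IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
    for (auto Index : enumerate(GEP->indices()))
      IsIndexLoopInvariant[Index.index()] =
          OrigLoop->isLoopInvariant(Index.value().get());
  }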
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
unsigned VF) {
PHINode *P = cast<PHINode>(PN);
@@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
case Instruction::PHI:
+ case Instruction::GetElementPtr:
llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::GetElementPtr: {
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
- auto *GEP = cast<GetElementPtrInst>(&I);
-
- if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
- VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
- addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr =
- OrigLoop->isLoopInvariant(GEP->getPointerOperand())
- ? GEP->getPointerOperand()
- : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
- if (OrigLoop->isLoopInvariant(U.get()))
- Indices.push_back(U.get());
- else
- Indices.push_back(getOrCreateVectorValue(U.get(), Part));
- }
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP =
- GEP->isInBounds()
- ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
- Indices)
- : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
- assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
- VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
- addMetadata(NewGEP, GEP);
- }
- }
-
- break;
- }
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
@@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
} // end of switch.
}
-void InnerLoopVectorizer::updateAnalysis() {
- // Forget the original basic block.
- PSE.getSE()->forgetLoop(OrigLoop);
-
- // DT is not kept up-to-date for outer loop vectorization
- if (EnableVPlanNativePath)
- return;
-
- // Update the dominator tree information.
- assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
- "Entry does not dominate exit.");
-
- DT->addNewBlock(LoopMiddleBlock,
- LI->getLoopFor(LoopVectorBody)->getLoopLatch());
- DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
- DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
- DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
@@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
return WideningDecision == CM_Scalarize;
}
const MaybeAlign Alignment = getLoadStoreAlignment(I);
- return isa<LoadInst>(I) ?
- !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
- : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
+ return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
+ isLegalMaskedGather(Ty, Alignment))
+ : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
+ isLegalMaskedScatter(Ty, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
+ // Instructions that are scalar with predication must not be considered
+ // uniform after vectorization, because that would create an erroneous
+ // replicating region where only a single instance out of VF should be formed.
+ // TODO: optimize such seldom cases if found important, see PR40816.
+ auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+ if (isScalarWithPredication(I, VF)) {
+ LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
+ << *I << "\n");
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+ Worklist.insert(I);
+ };
+
// Start with the conditional branch. If the branch condition is an
// instruction contained in the loop that is only used by the branch, it is
// uniform.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
- Worklist.insert(Cmp);
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
- }
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
// are pointers that are treated like consecutive pointers during
@@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
- if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
- Worklist.insert(V);
- }
+ if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+ addToWorklistIfAllowed(V);
// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be already inside Worklist. It ensures
@@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
return Worklist.count(J) ||
(OI == getLoadStorePointerOperand(J) &&
isUniformDecision(J, VF));
- })) {
- Worklist.insert(OI);
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
- }
+ }))
+ addToWorklistIfAllowed(OI);
}
}
@@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
continue;
// The induction variable and its update instruction will remain uniform.
- Worklist.insert(Ind);
- Worklist.insert(IndUpdate);
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
- << "\n");
+ addToWorklistIfAllowed(Ind);
+ addToWorklistIfAllowed(IndUpdate);
}
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
@@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
- // Do not interleave loops with a relatively small trip count.
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
+ // Do not interleave loops with a relatively small known or estimated trip
+ // count.
+ auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+ if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
return 1;
RegisterUsage R = calculateRegisterUsage({VF})[0];
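getSmallBestKnownTC is a new helper that is not visible in this excerpt; judging from its use here and the updated option description, it presumably returns either the exact small constant trip count or a profile-based estimate, roughly along these lines (a sketch, not the patch's actual implementation):

  static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
    // Prefer an exact compile-time trip count when SCEV can provide one.
    if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
      return ExpectedTC;
    // Otherwise fall back to an estimate derived from profile data.
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;
    return None;
  }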
@@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
- // If the trip count is constant, limit the interleave count to be less than
- // the trip count divided by VF.
- if (TC > 0) {
- assert(TC >= VF && "VF exceeds trip count?");
- if ((TC / VF) < MaxInterleaveCount)
- MaxInterleaveCount = (TC / VF);
+ // If the trip count is a known or estimated compile-time constant, limit the
+ // interleave count to be less than the trip count divided by VF.
+ if (BestKnownTC) {
+ MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
}
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// vectorized loop where the user of it is a vectorized instruction.
const MaybeAlign Alignment = getLoadStoreAlignment(I);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment ? Alignment->value() : 0, AS);
+ Alignment, AS);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
Alignment ? Alignment->value() : 0, AS);
else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
- Alignment ? Alignment->value() : 0, AS, I);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy,
- Alignment ? Alignment->value() : 0, AS) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);
bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy,
- Alignment ? Alignment->value() : 0, AS) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
unsigned AS = getLoadStoreAddressSpace(I);
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy,
- Alignment ? Alignment->value() : 0, AS, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
}
return getWideningCost(I, VF);
}
@@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
@@ -6225,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
- I->getOperand(0));
+ I->getOperand(0), I);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask;
}
-VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
- VFRange &Range,
- VPlanPtr &Plan) {
- const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
- if (!IG)
- return nullptr;
-
- // Now check if IG is relevant for VF's in the given range.
- auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
- return [=](unsigned VF) -> bool {
- return (VF >= 2 && // Query is illegal for VF == 1
- CM.getWideningDecision(I, VF) ==
- LoopVectorizationCostModel::CM_Interleave);
- };
- };
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
- return nullptr;
-
- // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
- // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
- // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
- assert(I == IG->getInsertPos() &&
- "Generating a recipe for an adjunct member of an interleave group");
-
- VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), Plan);
-
- return new VPInterleaveRecipe(IG, Mask);
-}
-
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan) {
@@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
auto willWiden = [&](unsigned VF) -> bool {
if (VF == 1)
return false;
- if (CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF))
- return false;
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, VF);
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
"CM decision should be taken at this point.");
- assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
- "Interleave memory opportunity should be caught earlier.");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
+ return true;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
return Decision != LoopVectorizationCostModel::CM_Scalarize;
};
@@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
if (Legal->isMaskRequired(I))
Mask = createBlockInMask(I->getParent(), Plan);
- return new VPWidenMemoryInstructionRecipe(*I, Mask);
+ VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+ return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
}
VPWidenIntOrFpInductionRecipe *
@@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
case Instruction::FPTrunc:
case Instruction::FRem:
case Instruction::FSub:
- case Instruction::GetElementPtr:
case Instruction::ICmp:
case Instruction::IntToPtr:
case Instruction::Load:
@@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
return false;
+ // If this ingredient's recipe is to be recorded, keep its recipe a singleton
+ // to avoid having to split recipes later.
+ bool IsSingleton = Ingredient2Recipe.count(I);
+
+ // Success: widen this instruction.
- // Success: widen this instruction. We optimize the common case where
+ // Use the default widening recipe. We optimize the common case where
// consecutive instructions can be represented by a single recipe.
- if (!VPBB->empty()) {
- VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
- if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
- return true;
- }
+ if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
+ LastExtensibleRecipe->appendInstruction(I))
+ return true;
- VPBB->appendRecipe(new VPWidenRecipe(I));
+ VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
+ if (!IsSingleton)
+ LastExtensibleRecipe = WidenRecipe;
+ setRecipe(I, WidenRecipe);
+ VPBB->appendRecipe(WidenRecipe);
return true;
}
@@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
[&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+ setRecipe(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
@@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
VPlanPtr &Plan, VPBasicBlock *VPBB) {
VPRecipeBase *Recipe = nullptr;
- // Check if Instr should belong to an interleave memory recipe, or already
- // does. In the latter case Instr is irrelevant.
- if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
- VPBB->appendRecipe(Recipe);
- return true;
- }
- // Check if Instr is a memory operation that should be widened.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
+ // First, check for specific widening recipes that deal with memory
+ // operations, inductions and Phi nodes.
+ if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
+ (Recipe = tryToOptimizeInduction(Instr, Range)) ||
+ (Recipe = tryToBlend(Instr, Plan)) ||
+ (isa<PHINode>(Instr) &&
+ (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
+ setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
return true;
}
- // Check if Instr should form some PHI recipe.
- if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
- VPBB->appendRecipe(Recipe);
- return true;
- }
- if ((Recipe = tryToBlend(Instr, Plan))) {
+ // Handle GEP widening.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+ auto Scalarize = [&](unsigned VF) {
+ return CM.isScalarWithPredication(Instr, VF) ||
+ CM.isScalarAfterVectorization(Instr, VF) ||
+ CM.isProfitableToScalarize(Instr, VF);
+ };
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
+ return false;
+ VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
+ setRecipe(Instr, Recipe);
VPBB->appendRecipe(Recipe);
return true;
}
- if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
- VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
- return true;
- }
// Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes that deal with
- // Interleave Groups, Inductions and Phi nodes.
+ // having first checked for specific widening recipes.
if (tryToWiden(Instr, VPBB, Range))
return true;
@@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+ SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
+
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
+
+ // ---------------------------------------------------------------------------
+ // Pre-construction: record ingredients whose recipes we'll need to further
+ // process after constructing the initial VPlan.
+ // ---------------------------------------------------------------------------
+
+ // Mark instructions we'll need to sink later and their targets as
+ // ingredients whose recipe we'll need to record.
+ for (auto &Entry : SinkAfter) {
+ RecipeBuilder.recordRecipeOf(Entry.first);
+ RecipeBuilder.recordRecipeOf(Entry.second);
+ }
+
+ // For each interleave group which is relevant for this (possibly trimmed)
+ // Range, add it to the set of groups to be later applied to the VPlan and add
+ // placeholders for its members' Recipes which we'll be replacing with a
+ // single VPInterleaveRecipe.
+ for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+ auto applyIG = [IG, this](unsigned VF) -> bool {
+ return (VF >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ if (!getDecisionAndClampRange(applyIG, Range))
+ continue;
+ InterleaveGroups.insert(IG);
+ for (unsigned i = 0; i < IG->getFactor(); i++)
+ if (Instruction *Member = IG->getMember(i))
+ RecipeBuilder.recordRecipeOf(Member);
+  }
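
The two pre-construction loops above work as a pair: applyIG clamps the range so the interleave decision holds for every VF left in it, and recordRecipeOf marks sink-after and group-member ingredients so that their recipes can be looked up once the initial VPlan exists, via the setRecipe calls added earlier and the getRecipe calls further down. Here is a minimal stand-in for that record/set/get contract, with toy types instead of the VPRecipeBuilder API; tracking only the requested ingredients is a simplification chosen for this sketch.

    #include <cassert>
    #include <map>

    // Sketch only; keys and values stand in for Instruction* and VPRecipeBase*.
    struct ToyRecipeMap {
      std::map<const void *, void *> Ingredient2Recipe;

      // Pre-construction: mark an ingredient whose recipe we must find later.
      void recordRecipeOf(const void *I) { Ingredient2Recipe.insert({I, nullptr}); }

      // During construction: remember the recipe created for a recorded ingredient.
      void setRecipe(const void *I, void *R) {
        auto It = Ingredient2Recipe.find(I);
        if (It != Ingredient2Recipe.end())
          It->second = R;
      }

      // Post-construction: resolve a recorded ingredient to its recipe.
      void *getRecipe(const void *I) const {
        auto It = Ingredient2Recipe.find(I);
        assert(It != Ingredient2Recipe.end() && It->second && "recipe never recorded/set");
        return It->second;
      }
    };
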
+
+ // ---------------------------------------------------------------------------
+ // Build initial VPlan: Scan the body of the loop in a topological order to
+ // visit each basic block after having visited its predecessor basic blocks.
+ // ---------------------------------------------------------------------------
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
auto Plan = std::make_unique<VPlan>(VPBB);
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
Plan->addVPValue(V);
@@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBB = FirstVPBBForBB;
Builder.setInsertPoint(VPBB);
- std::vector<Instruction *> Ingredients;
-
- // Organize the ingredients to vectorize from current basic block in the
- // right order.
+ // Introduce each ingredient into VPlan.
for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
@@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
DeadInstructions.find(Instr) != DeadInstructions.end())
continue;
- // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
- // member of the IG, do not construct any Recipe for it.
- const InterleaveGroup<Instruction> *IG =
- CM.getInterleavedAccessGroup(Instr);
- if (IG && Instr != IG->getInsertPos() &&
- Range.Start >= 2 && // Query is illegal for VF == 1
- CM.getWideningDecision(Instr, Range.Start) ==
- LoopVectorizationCostModel::CM_Interleave) {
- auto SinkCandidate = SinkAfterInverse.find(Instr);
- if (SinkCandidate != SinkAfterInverse.end())
- Ingredients.push_back(SinkCandidate->second);
- continue;
- }
-
- // Move instructions to handle first-order recurrences, step 1: avoid
- // handling this instruction until after we've handled the instruction it
- // should follow.
- auto SAIt = SinkAfter.find(Instr);
- if (SAIt != SinkAfter.end()) {
- LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
- << *SAIt->second
- << " to vectorize a 1st order recurrence.\n");
- SinkAfterInverse[SAIt->second] = Instr;
- continue;
- }
-
- Ingredients.push_back(Instr);
-
- // Move instructions to handle first-order recurrences, step 2: push the
- // instruction to be sunk at its insertion point.
- auto SAInvIt = SinkAfterInverse.find(Instr);
- if (SAInvIt != SinkAfterInverse.end())
- Ingredients.push_back(SAInvIt->second);
- }
-
- // Introduce each ingredient into VPlan.
- for (Instruction *Instr : Ingredients) {
if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
continue;
@@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;
+ // ---------------------------------------------------------------------------
+ // Transform initial VPlan: Apply previously taken decisions, in order, to
+ // bring the VPlan to its final state.
+ // ---------------------------------------------------------------------------
+
+ // Apply Sink-After legal constraints.
+ for (auto &Entry : SinkAfter) {
+ VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
+ VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
+ Sink->moveAfter(Target);
+ }
+
+ // Interleave memory: for each Interleave Group we marked earlier as relevant
+ // for this VPlan, replace the Recipes widening its memory instructions with a
+ // single VPInterleaveRecipe at its insertion point.
+ for (auto IG : InterleaveGroups) {
+ auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
+ RecipeBuilder.getRecipe(IG->getInsertPos()));
+ (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
+ ->insertBefore(Recipe);
+
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *Member = IG->getMember(i)) {
+ RecipeBuilder.getRecipe(Member)->eraseFromParent();
+ }
+ }
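
The rewiring above is a list-surgery pattern: build one combined recipe, insert it right before the recipe at the group's insert position, then erase every member's individual recipe. The same pattern in self-contained form, with std::list standing in for the recipe list and toy node types rather than VPlan classes:

    #include <iterator>
    #include <list>
    #include <vector>

    struct ToyNode { int Id; };

    // Replace all group members with a single combined node, placed where the
    // member acting as the group's insert position currently sits.
    static void combineGroup(std::list<ToyNode> &Block, const std::vector<int> &MemberIds,
                             int InsertPosId, int CombinedId) {
      for (auto It = Block.begin(); It != Block.end();) {
        bool IsMember = false;
        for (int Id : MemberIds)
          IsMember |= (It->Id == Id);
        if (It->Id == InsertPosId)
          Block.insert(It, ToyNode{CombinedId});           // mirrors insertBefore(Recipe)
        It = IsMember ? Block.erase(It) : std::next(It);   // mirrors eraseFromParent()
      }
    }
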
+
// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
if (CM.foldTailByMasking()) {
@@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
}
SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanHCFGTransforms::VPInstructionsToVPRecipes(
- Plan, Legal->getInductionVars(), DeadInstructions);
-
+ VPlanTransforms::VPInstructionsToVPRecipes(
+ OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
return Plan;
}
@@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) {
return ILV.getOrCreateVectorValue(V, Part);
}
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
+ Value *V, const VPIteration &Instance) {
+ return ILV.getOrCreateScalarValue(V, Instance);
+}
+
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
<< Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
- if (User) {
+ O << ", ";
+ getAddr()->printAsOperand(O);
+ VPValue *Mask = getMask();
+ if (Mask) {
O << ", ";
- User->getOperand(0)->printAsOperand(O);
+ Mask->printAsOperand(O);
}
O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
@@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) {
State.ILV->widenInstruction(Instr);
}
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
+ State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
+ IsIndexLoopInvariant);
+}
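
widenGEP consumes two invariance summaries that VPWidenGEPRecipe captures when it is built from the GEP and OrigLoop (see the constructor call in tryToCreateRecipe above). A plausible way to compute them is sketched below as a free function; this is an assumption about how the flags are derived, not the in-tree constructor.

    #include "llvm/ADT/SmallBitVector.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Instructions.h"

    // One flag for the pointer operand, one bit per index operand.
    static void computeGEPInvariance(llvm::GetElementPtrInst *GEP, llvm::Loop *OrigLoop,
                                     bool &IsPtrLoopInvariant,
                                     llvm::SmallBitVector &IsIndexLoopInvariant) {
      IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
      IsIndexLoopInvariant.resize(GEP->getNumIndices());
      unsigned Idx = 0;
      for (llvm::Use &IndexOp : GEP->indices())
        IsIndexLoopInvariant[Idx++] = OrigLoop->isLoopInvariant(IndexOp.get());
    }
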
+
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
State.ILV->widenIntOrFpInduction(IV, Trunc);
@@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- if (!User)
- return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
-
- // Last (and currently only) operand is a mask.
- InnerLoopVectorizer::VectorParts MaskValues(State.UF);
- VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
- for (unsigned Part = 0; Part < State.UF; ++Part)
- MaskValues[Part] = State.get(Mask, Part);
- State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
+ getMask());
}
void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- if (!User)
- return State.ILV->vectorizeMemoryInstruction(&Instr);
-
- // Last (and currently only) operand is a mask.
- InnerLoopVectorizer::VectorParts MaskValues(State.UF);
- VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
- for (unsigned Part = 0; Part < State.UF; ++Part)
- MaskValues[Part] = State.get(Mask, Part);
- State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
-}
-
-static ScalarEpilogueLowering
-getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
- ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
- else if (PreferPredicateOverEpilog || Hints.getPredicate())
- SEL = CM_ScalarEpilogueNotNeededUsePredicate;
-
- return SEL;
+ State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
+}
+
+// Determine how to lower the scalar epilogue, which depends on 1) optimising
+// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
+// predication, and 4) a TTI hook that analyses whether the loop is suitable
+// for predication.
+static ScalarEpilogueLowering getScalarEpilogueLowering(
+ Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ LoopVectorizationLegality &LVL) {
+ bool OptSize =
+ F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
+ if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+ return CM_ScalarEpilogueNotAllowedOptSize;
+
+ bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
+ !PreferPredicateOverEpilog;
+
+ // 2) Next, if disabling predication is requested on the command line, honour
+ // this and request a scalar epilogue. Also do this if we don't have a
+ // primary induction variable, which is required for predication.
+ if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+ return CM_ScalarEpilogueAllowed;
+
+  // 3) and 4) Look if predication is requested on the command line or with a
+  // loop hint, or if the TTI hook indicates it is profitable; if so, request
+  // predication.
+ if (PreferPredicateOverEpilog ||
+ Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
+ (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+ LVL.getLAI()) &&
+ Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+
+ return CM_ScalarEpilogueAllowed;
}
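
Reading the function top to bottom gives a strict precedence among the four numbered cases. A few illustrative outcomes, derived only from the conditions visible above (the inputs are hypothetical):

    function optimised for size (attribute or PGSO), vectorization not forced
                                                        -> CM_ScalarEpilogueNotAllowedOptSize
    -prefer-predicate-over-epilog explicitly set to false,
      or the loop has no primary induction variable     -> CM_ScalarEpilogueAllowed
    the option or the loop hint requests predication, or the TTI hook says it is
      profitable and the hint does not disable it       -> CM_ScalarEpilogueNotNeededUsePredicate
    none of the above                                   -> CM_ScalarEpilogueAllowed
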
// Process the loop in the VPlan-native vectorization path. This path builds
@@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath(
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
@@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
@@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();