author     Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp  1011
1 file changed, 548 insertions(+), 463 deletions(-)
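Two of the behavioral changes in this patch are easy to see in isolation: the tiny-trip-count interleave threshold becomes a cl::opt and is now compared against a known or estimated trip count (via getSmallBestKnownTC), and the interleave count is clamped to trip-count / VF whenever such an estimate exists. The standalone C++ sketch below mirrors that clamping logic with plain types; the function name, parameter names, and the inlined default threshold of 128 (taken from the cl::init value in the diff) are illustrative stand-ins, not the actual LoopVectorizationCostModel interface.

```cpp
// Standalone sketch (not LLVM code): mirrors the new interleave-count
// clamping in selectInterleaveCount() using plain types. All names and
// values here are illustrative assumptions.
#include <algorithm>
#include <cstdio>
#include <optional>

static unsigned clampInterleaveCount(std::optional<unsigned> bestKnownTripCount,
                                     unsigned VF, unsigned maxInterleave,
                                     unsigned tinyTripCountThreshold = 128) {
  // Do not interleave loops with a relatively small known or estimated trip
  // count (previously only a known-constant trip count was checked).
  if (bestKnownTripCount && *bestKnownTripCount < tinyTripCountThreshold)
    return 1;
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to trip-count / VF.
  if (bestKnownTripCount)
    maxInterleave = std::min(*bestKnownTripCount / VF, maxInterleave);
  return maxInterleave;
}

int main() {
  std::printf("%u\n", clampInterleaveCount(std::nullopt, 4, 8)); // 8: no estimate, no clamp
  std::printf("%u\n", clampInterleaveCount(64u, 4, 8));          // 1: below tiny-TC threshold
  std::printf("%u\n", clampInterleaveCount(160u, 16, 16));       // 10: clamped to 160 / 16
}
```

Under the old code only a known constant trip count triggered the early return (TC > 1 && TC < TinyTripCountInterleaveThreshold); with getSmallBestKnownTC a profile-estimated trip count now has the same effect, and the TC / VF clamp applies to estimates as well.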
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8f0bf70f873c..684a3098e564 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,8 +58,8 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanHCFGTransforms.h" #include "VPlanPredicator.h" +#include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -124,6 +124,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -149,7 +150,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; @@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -/// We don't interleave loops with a known constant trip count below this -/// number. -static const unsigned TinyTripCountInterleaveThreshold = 128; +static cl::opt<unsigned> TinyTripCountInterleaveThreshold( + "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, + cl::desc("We don't interleave loops with a estimated constant trip count " + "below this number")); static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -427,6 +428,11 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -476,15 +482,20 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. - void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); - - /// Vectorize Load and Store instructions, optionally masking the vector - /// operations if \p BlockInMask is non-null. - void vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to + /// with the base address given in \p Addr, optionally masking the vector + /// operations if \p BlockInMask is non-null. Use \p State to translate given + /// VPValues to IR values in the vectorized loop. + void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask = nullptr); + + /// Vectorize Load and Store instructions with the base address given in \p + /// Addr, optionally masking the vector operations if \p BlockInMask is + /// non-null. Use \p State to translate given VPValues to IR values in the + /// vectorized loop. 
+ void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -525,6 +536,9 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Clear NSW/NUW flags from reduction instructions if necessary. + void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -539,10 +553,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -1204,14 +1214,14 @@ public: /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType) { - return TTI.isLegalMaskedScatter(DataType); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI.isLegalMaskedGather(DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedGather(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather @@ -1222,7 +1232,9 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + MaybeAlign Align = getLoadStoreAlignment(V); + return (LI && isLegalMaskedGather(Ty, Align)) || + (SI && isLegalMaskedScatter(Ty, Align)); } /// Returns true if \p I is an instruction that will be scalarized with @@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { const InterleaveGroup<Instruction> *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. 
- setDebugLocFromInst(Builder, Ptr); - SmallVector<Value *, 2> NewPtrs; + SmallVector<Value *, 2> AddrParts; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskForCondRequired = BlockInMask; - if (IsMaskForCondRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } + // TODO: extend the masked interleaved-group support to reversed access. + assert((!BlockInMask || !Group->isReverse()) && + "Reversed masked interleave-group not supported."); // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, @@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *AddrPart = State.get(Addr, {Part, 0}); + setDebugLocFromInst(Builder, AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. - NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); - if (InBounds) - cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. - NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); + Type *PtrTy = VecTy->getPointerTo(AddressSpace); + AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } setDebugLocFromInst(Builder, Instr); @@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskForCondRequired || MaskForGaps) { + if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = MaskForGaps; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); GroupMask = MaskForGaps ? 
Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), GroupMask, UndefVec, "wide.masked.vec"); } else - NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); @@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, "interleaved.vec"); Instruction *NewStoreInstr; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point"); if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr); + return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getLoadStorePointerOperand(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); const Align Alignment = DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); - unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"); + (void)ConsecutiveStride; - // Handle consecutive loads/stores. 
- if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); - - VectorParts Mask; + VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; if (isMaskRequired) - Mask = *BlockInMask; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>( - getLoadStorePointerOperand(Instr)->stripPointerCasts())) - InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; ++Part) + BlockInMaskParts[Part] = State.get(BlockInMask, Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. GetElementPtrInst *PartPtr = nullptr; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. @@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); + BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); }; @@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment.value(), MaskPart); } else { @@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, - Alignment.value(), Mask[Part]); + NewSI = Builder.CreateMaskedStore( + StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); @@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); + NewLI = Builder.CreateMaskedLoad( + VecPtr, Alignment.value(), BlockInMaskParts[Part], + UndefValue::get(DataTy), "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), "wide.load"); @@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. 
SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, + nullptr, "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, SCEVCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + } + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2752,11 +2777,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. 
- DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, MemCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); + } + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. - BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopVectorPreHeader = OrigLoop->getLoopPreheader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopExitBlock && "Must have an exit block"); + assert(LoopVectorPreHeader && "Invalid loop structure"); + + LoopMiddleBlock = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "middle.block"); + LoopScalarPreHeader = + SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, + nullptr, "scalar.ph"); + // We intentionally don't let SplitBlock to update LoopInfo since + // LoopVectorBody should belong to another loop than LoopVectorPreHeader. + // LoopVectorBody is explicitly added to the correct place few lines later. + LoopVectorBody = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + nullptr, nullptr, "vector.body"); + + // Update dominator for loop exit. + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // before calling any utilities such as SCEV that require valid LoopInfo. 
if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. 
Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. - LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, @@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + LI->verify(*DT); +#endif + return LoopVectorPreHeader; } @@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - // Set the insertion point after the previous value if it is an instruction. + // Find and set the insertion point after the previous value if it is an + // instruction. + BasicBlock::iterator InsertPt; // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. Also, if the previous - // value is a phi node, we should insert after all the phi nodes to avoid - // breaking basic block verification. 
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || - isa<PHINode>(PreviousLastPart)) - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - else - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); + // guaranteed to be an instruction in the vector loop. + // FIXME: Loop invariant values do not form recurrences. We should deal with + // them earlier. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) + InsertPt = LoopVectorBody->getFirstInsertionPt(); + else { + Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); + if (isa<PHINode>(PreviousLastPart)) + // If the previous value is a phi node, we should insert after all the phi + // nodes in the block containing the PHI to avoid breaking basic block + // verification. Note that the basic block may be different to + // LoopVectorBody, in case we predicate the loop. + InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); + else + InsertPt = ++PreviousInst->getIterator(); + } + Builder.SetInsertPoint(&*InsertPt); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. @@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } } + // Wrap flags are in general invalid after vectorization, clear them. + clearReductionWrapFlags(RdxDesc); + // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? 
VectorStart : Identity; cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); @@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } +void InnerLoopVectorizer::clearReductionWrapFlags( + RecurrenceDescriptor &RdxDesc) { + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurrenceDescriptor::RK_IntegerAdd && + RK != RecurrenceDescriptor::RK_IntegerMult) + return; + + Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); + assert(LoopExitInstr && "null loop exit instruction"); + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(LoopExitInstr); + Visited.insert(LoopExitInstr); + + while (!Worklist.empty()) { + Instruction *Cur = Worklist.pop_back_val(); + if (isa<OverflowingBinaryOperator>(Cur)) + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = getOrCreateVectorValue(Cur, Part); + cast<Instruction>(V)->dropPoisonGeneratingFlags(); + } + + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && + Visited.insert(UI).second) + Worklist.push_back(UI); + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { @@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. 
If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast<PHINode>(PN); @@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast<GetElementPtrInst>(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? 
GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return WideningDecision == CM_Scalarize; } const MaybeAlign Alignment = getLoadStoreAlignment(I); - return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. + auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. 
If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); @@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small trip count. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountInterleaveThreshold) + // Do not interleave loops with a relatively small known or estimated trip + // count. + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If the trip count is constant, limit the interleave count to be less than - // the trip count divided by VF. - if (TC > 0) { - assert(TC >= VF && "VF exceeds trip count?"); - if ((TC / VF) < MaxInterleaveCount) - MaxInterleaveCount = (TC / VF); + // If trip count is known or estimated compile time constant, limit the + // interleave count to be less than the trip count divided by VF. 
+ if (BestKnownTC) { + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // vectorized loop where the user of it is a vectorized instruction. const MaybeAlign Alignment = getLoadStoreAlignment(I); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment ? Alignment->value() : 0, AS); + Alignment, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - Alignment ? Alignment->value() : 0, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, @@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, - Alignment ? Alignment->value() : 0, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); } return getWideningCost(I, VF); } @@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; @@ -6225,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0)); + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { - const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. 
- auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. - assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); -} - VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { @@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) return false; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); + if (Decision == LoopVectorizationCostModel::CM_Interleave) + return true; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - return new VPWidenMemoryInstructionRecipe(*I, Mask); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); } VPWidenIntOrFpInductionRecipe * @@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: @@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. 
- if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, SmallPtrSetImpl<Instruction *> &DeadInstructions) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. 
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. 
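// For illustration only: a self-contained sketch of the record/set/get
// contract used by the pre-construction step above. RecipeTracker and the
// string keys are assumptions, not the real VPRecipeBuilder interface:
// ingredients whose recipes must be post-processed are recorded up front,
// every created recipe is registered, and a lookup asserts that a recorded
// ingredient really did receive a recipe.
#include <cassert>
#include <map>
#include <set>
#include <string>

struct RecipeTracker {
  std::set<std::string> Recorded;                // Ingredients to post-process.
  std::map<std::string, int> IngredientToRecipe; // Ingredient -> recipe id.

  void recordRecipeOf(const std::string &Ingredient) {
    Recorded.insert(Ingredient);
  }
  void setRecipe(const std::string &Ingredient, int RecipeId) {
    assert(!IngredientToRecipe.count(Ingredient) &&
           "ingredient already has a recipe");
    IngredientToRecipe[Ingredient] = RecipeId;
  }
  int getRecipe(const std::string &Ingredient) const {
    auto It = IngredientToRecipe.find(Ingredient);
    assert(It != IngredientToRecipe.end() &&
           "recorded ingredient never received a recipe");
    return It->second;
  }
};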
- const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. + for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. 
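// For illustration only: a compact standalone model of the interleave-group
// rewrite performed above. A std::list of names stands in for the recipe list
// and the function name is an assumption; the shape mirrors inserting one
// combined VPInterleaveRecipe before the group's insert position and then
// erasing each member's standalone memory recipe. It assumes the combined
// recipe's name differs from the member names.
#include <algorithm>
#include <list>
#include <string>
#include <vector>

static void
replaceGroupWithCombinedRecipe(std::list<std::string> &Recipes,
                               const std::vector<std::string> &Members,
                               const std::string &InsertPos,
                               const std::string &Combined) {
  auto It = std::find(Recipes.begin(), Recipes.end(), InsertPos);
  if (It == Recipes.end())
    return;                     // Insert position not present; nothing to do.
  Recipes.insert(It, Combined); // Combined recipe lands right before InsertPos.
  for (const std::string &M : Members)
    Recipes.remove(M);          // Erase every member's standalone recipe.
}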
if (CM.foldTailByMasking()) { @@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); - - // Last (and currently only) operand is a mask. 
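// For illustration only: a rough, self-contained picture of the UF-by-VF shape
// that the execute() methods above hand to the InnerLoopVectorizer, with plain
// vectors standing in for IR vector values (all names here are assumptions).
// Each recipe produces UF "parts", and each part is a VF-wide value whose
// lanes correspond to consecutive scalar iterations.
#include <vector>

using Lane = long long;
using Part = std::vector<Lane>;

// Widen a scalar function of the iteration index into UF parts of VF lanes.
template <typename Fn>
static std::vector<Part> widenPerPart(unsigned UF, unsigned VF, Fn ScalarOp) {
  std::vector<Part> Parts(UF);
  for (unsigned P = 0; P < UF; ++P) {
    Parts[P].reserve(VF);
    for (unsigned L = 0; L < VF; ++L)
      Parts[P].push_back(ScalarOp(P * VF + L)); // Lane L of part P.
  }
  return Parts;
}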
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); -} - -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || Hints.getPredicate()) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; - - return SEL; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} + +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
Turn CM into an
  // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();
@@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
@@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  CM.collectValuesToIgnore();
  // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();
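// For illustration only: a minimal sketch of the decision order implemented by
// the reworked getScalarEpilogueLowering earlier in this diff, with the LLVM
// analyses replaced by plain booleans (all parameter names are assumptions):
// opt-for-size wins unless vectorization is forced, an explicit request to
// avoid predication (or a missing primary induction) keeps the scalar
// epilogue, and otherwise an explicit or target-driven preference for
// predication folds the tail by masking.
enum class EpilogueLowering { Allowed, NotAllowedOptSize, UsePredicate };

static EpilogueLowering chooseEpilogueLowering(
    bool OptForSize, bool ForceVectorize, bool PredicationExplicitlyDisabled,
    bool HasPrimaryInduction, bool PredicationRequested,
    bool TTIPrefersPredication, bool PredicationHintDisabled) {
  // 1) Code size takes precedence: no scalar epilogue allowed.
  if (OptForSize && !ForceVectorize)
    return EpilogueLowering::NotAllowedOptSize;
  // 2) Honour an explicit "no predication" request, and fall back to a scalar
  //    epilogue when there is no primary induction to predicate on.
  if (PredicationExplicitlyDisabled || !HasPrimaryInduction)
    return EpilogueLowering::Allowed;
  // 3) + 4) Predication requested via option or hint, or deemed profitable by
  //    the target and not explicitly disabled by a hint.
  if (PredicationRequested ||
      (TTIPrefersPredication && !PredicationHintDisabled))
    return EpilogueLowering::UsePredicate;
  return EpilogueLowering::Allowed;
}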