author     Dimitry Andric <dim@FreeBSD.org>   2019-10-23 17:51:42 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2019-10-23 17:51:42 +0000
commit     1d5ae1026e831016fc29fd927877c86af904481f
tree       2cdfd12620fcfa5d9e4a0389f85368e8e36f63f9 /lib/Transforms/Vectorize
parent     e6d1592492a3a379186bfb02bd0f4eda0669c0d5
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp        |  26
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  | 186
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorizationPlanner.h     |   4
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp              | 738
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp              | 820
-rw-r--r--  lib/Transforms/Vectorize/VPlan.cpp                      |  19
-rw-r--r--  lib/Transforms/Vectorize/VPlan.h                        |   4
-rw-r--r--  lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp        |   2
-rw-r--r--  lib/Transforms/Vectorize/VPlanSLP.cpp                   |  13
9 files changed, 1143 insertions, 669 deletions
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 4273080ddd91..f44976c723ec 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -147,7 +147,7 @@ private: static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth = 0) const; bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth) const; @@ -336,14 +336,29 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { + APInt PtrDelta, unsigned Depth) const { unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); APInt OffsetA(PtrBitWidth, 0); APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return false; + + // In case if we have to shrink the pointer + // stripAndAccumulateInBoundsConstantOffsets should properly handle a + // possible overflow and the value should fit into a smallest data type + // used in the cast/gep chain. + assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && + OffsetB.getMinSignedBits() <= NewPtrBitWidth); + + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + APInt OffsetDelta = OffsetB - OffsetA; // Check if they are based on the same pointer. That makes the offsets @@ -650,7 +665,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // We can ignore the alias if the we have a load store pair and the load // is known to be invariant. The load cannot be clobbered by the store. auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->getMetadata(LLVMContext::MD_invariant_load); + return LI->hasMetadata(LLVMContext::MD_invariant_load); }; // We can ignore the alias as long as the load comes before the store, @@ -1077,7 +1092,7 @@ bool Vectorizer::vectorizeLoadChain( LoadInst *L0 = cast<LoadInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. - Type *LoadTy; + Type *LoadTy = nullptr; for (const auto &V : Chain) { LoadTy = cast<LoadInst>(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) @@ -1089,6 +1104,7 @@ bool Vectorizer::vectorizeLoadChain( break; } } + assert(LoadTy && "Can't determine LoadInst type from chain"); unsigned Sz = DL.getTypeSizeInBits(LoadTy); unsigned AS = L0->getPointerAddressSpace(); diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6ef8dc2d3cd7..f43842be5357 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,7 +13,10 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). 
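The LoadStoreVectorizer hunk above changes areConsecutivePointers to take PtrDelta by value so the delta can be narrowed once both pointers have been stripped to a (possibly smaller) index width. A standalone sketch of that shape, in plain C++ with illustrative names standing in for the LLVM APInt machinery:

    #include <cstdint>

    struct StrippedPtr {
      const void *Base;   // underlying object after stripping casts/GEPs
      int64_t Offset;     // accumulated constant byte offset
      unsigned BitWidth;  // index width of the stripped pointer type (1..64)
    };

    // Sign-extend V from the low Bits bits, mimicking what
    // APInt::sextOrTrunc does for narrow widths in the patch.
    static int64_t signExtendFrom(int64_t V, unsigned Bits) {
      uint64_t Mask = (Bits >= 64) ? ~0ULL : ((1ULL << Bits) - 1);
      uint64_t U = static_cast<uint64_t>(V) & Mask;
      uint64_t Sign = 1ULL << (Bits - 1);
      return static_cast<int64_t>((U ^ Sign) - Sign);
    }

    bool areConsecutiveSketch(StrippedPtr A, StrippedPtr B, int64_t PtrDelta) {
      if (A.BitWidth != B.BitWidth)
        return false; // mixed index widths: bail out, as the patch does
      int64_t OffA = signExtendFrom(A.Offset, A.BitWidth);
      int64_t OffB = signExtendFrom(B.Offset, B.BitWidth);
      int64_t Delta = signExtendFrom(PtrDelta, A.BitWidth);
      // Same underlying object: consecutiveness reduces to offset arithmetic.
      return A.Base == B.Base && OffB - OffA == Delta;
    }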
// +#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,38 +50,6 @@ static const unsigned MaxInterleaveFactor = 16; namespace llvm { -#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; - if (I != nullptr) - dbgs() << " " << *I; - else - dbgs() << '.'; - dbgs() << '\n'; -} -#endif - -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: @@ -88,6 +59,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_FORCE: return (Val <= 1); case HK_ISVECTORIZED: + case HK_PREDICATE: return (Val == 0 || Val == 1); } return false; @@ -99,7 +71,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), + Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), + ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -250,7 +224,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -435,7 +409,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -445,14 +420,6 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -void LoopVectorizationLegality::reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, - const StringRef ORETag, Instruction *I) const { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); - ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); -} - bool LoopVectorizationLegality::canVectorizeOuterLoop() { assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case @@ -467,7 +434,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!Br) { reportVectorizationFailure("Unsupported basic block terminator", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -486,7 +453,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { !LI->isLoopHeader(Br->getSuccessor(1))) { reportVectorizationFailure("Unsupported conditional branch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -500,7 +467,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { TheLoop /*context outer loop*/)) { reportVectorizationFailure("Outer loop contains divergent loops", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -511,7 +478,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", "Unsupported outer loop Phi(s)", - "UnsupportedPhi"); + "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -618,7 +585,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !PhiTy->isPointerTy()) { reportVectorizationFailure("Found a non-int non-pointer PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); return false; } @@ -631,6 +598,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Unsafe cyclic dependencies with header phis are identified during // legalization for reduction, induction and first order // recurrences. 
+ AllowedExit.insert(&I); continue; } @@ -638,7 +606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Phi->getNumIncomingValues() != 2) { reportVectorizationFailure("Found an invalid PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", Phi); + "CFGNotUnderstood", ORE, TheLoop, Phi); return false; } @@ -690,7 +658,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", Phi); + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); return false; } // end of PHI handling @@ -721,11 +689,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { "library call cannot be vectorized. " "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } else { reportVectorizationFailure("Found a non-intrinsic callsite", "call instruction cannot be vectorized", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } return false; } @@ -740,7 +708,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", CI); + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } } @@ -753,7 +721,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { isa<ExtractElementInst>(I)) { reportVectorizationFailure("Found unvectorizable type", "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", &I); + "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); return false; } @@ -763,7 +731,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", "store instruction cannot be vectorized", - "CantVectorizeStore", ST); + "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -773,12 +741,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Arbitrarily try a vector of 2 elements. Type *VecTy = VectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - unsigned Alignment = getLoadStoreAlignment(ST); - if (!TTI->isLegalNTStore(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(ST); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTStore(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ST); + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } } @@ -789,12 +758,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target (arbitrarily try a vector of 2 elements). 
Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - unsigned Alignment = getLoadStoreAlignment(LD); - if (!TTI->isLegalNTLoad(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(LD); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", LD); + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } } @@ -823,7 +793,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } reportVectorizationFailure("Value cannot be used outside the loop", "value cannot be used outside the loop", - "ValueUsedOutsideLoop", &I); + "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } } // next instr. @@ -833,12 +803,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", "loop induction variable could not be identified", - "NoInductionVariable"); + "NoInductionVariable", ORE, TheLoop); return false; } else if (!WidestIndTy) { reportVectorizationFailure("Did not find one integer induction var", "integer loop induction variable could not be identified", - "NoIntegerInductionVariable"); + "NoIntegerInductionVariable", ORE, TheLoop); return false; } else { LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); @@ -869,7 +839,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("Stores to a uniform address", "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress"); + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -905,7 +875,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -924,7 +894,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -953,23 +923,41 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", "if-conversion is disabled", - "IfConversionDisabled"); + "IfConversionDisabled", + ORE, TheLoop); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - // A list of pointers that we can safely read and write to. + // A list of pointers which are known to be dereferenceable within scope of + // the loop body for each iteration of the loop which executes. That is, + // the memory pointed to can be dereferenced (with the access size implied by + // the value's type) unconditionally within the loop header without + // introducing a new fault. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. 
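The rewritten collection loop that follows distinguishes two cases: in unconditionally executed blocks every accessed pointer is safe, while in predicated blocks only loads proven dereferenceable for the whole loop qualify. A condensed sketch of that control flow, with hypothetical stand-in types (DerefInLoop abstracts isDereferenceableAndAlignedInLoop):

    #include <set>
    #include <vector>

    struct Inst { const void *Ptr = nullptr; bool IsLoad = false; };
    struct Block { bool NeedsPredication; std::vector<Inst *> Insts; };

    std::set<const void *>
    collectSafePointers(const std::vector<Block> &Blocks,
                        bool (*DerefInLoop)(const Inst *)) {
      std::set<const void *> Safe;
      for (const Block &BB : Blocks) {
        if (!BB.NeedsPredication) {
          // Unconditionally executed: every accessed pointer is safe as-is.
          for (const Inst *I : BB.Insts)
            if (I->Ptr)
              Safe.insert(I->Ptr);
          continue;
        }
        // Predicated block: only loads proven dereferenceable across the
        // whole loop may be speculated; stores stay out due to concurrency.
        for (const Inst *I : BB.Insts)
          if (I->IsLoad && DerefInLoop(I))
            Safe.insert(I->Ptr);
      }
      return Safe;
    }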
for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) { + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); continue; + } - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + // For a block which requires predication, a address may be safe to access + // in the loop w/o predication if we can prove dereferenceability facts + // sufficient to ensure it'll never fault within the loop. For the moment, + // we restrict this to loads; stores are more complicated due to + // concurrency restrictions. + ScalarEvolution &SE = *PSE.getSE(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (LI && !mustSuppressSpeculation(*LI) && + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) + SafePointes.insert(LI->getPointerOperand()); + } } // Collect the blocks that need predication. @@ -979,7 +967,8 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) { reportVectorizationFailure("Loop contains a switch statement", "loop contains a switch statement", - "LoopContainsSwitch", BB->getTerminator()); + "LoopContainsSwitch", ORE, TheLoop, + BB->getTerminator()); return false; } @@ -989,14 +978,16 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } @@ -1026,7 +1017,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getLoopPreheader()) { reportVectorizationFailure("Loop doesn't have a legal pre-header", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1037,7 +1028,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getNumBackEdges() != 1) { reportVectorizationFailure("The loop must have a single backedge", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1048,7 +1039,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getExitingBlock()) { reportVectorizationFailure("The loop must have an exiting block", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1061,7 +1052,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (Lp->getExitingBlock() != Lp->getLoopLatch()) { reportVectorizationFailure("The exiting block is not the loop latch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1124,7 +1115,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "unsupported outer loop", - 
"UnsupportedOuterLoop"); + "UnsupportedOuterLoop", + ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1176,7 +1168,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", - "TooManySCEVRunTimeChecks"); + "TooManySCEVRunTimeChecks", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1190,7 +1182,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1199,22 +1191,21 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { "No primary induction, cannot fold tail by masking", "Missing a primary induction variable in the loop, which is " "needed in order to fold tail by masking as required.", - "NoPrimaryInduction"); + "NoPrimaryInduction", ORE, TheLoop); return false; } - // TODO: handle reductions when tail is folded by masking. - if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking"); - return false; - } + SmallPtrSet<const Value *, 8> ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) @@ -1222,7 +1213,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { reportVectorizationFailure( "Cannot fold tail by masking, loop has an outside user for", "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", UI); + "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); return false; } } @@ -1233,11 +1224,12 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e3..a5e85f27fabf 100644 --- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ public: /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..8f0bf70f873c 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired, predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt<bool> PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired, predication should be " + "used instead.")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -347,6 +355,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. + if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. 
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -795,6 +826,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert back to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +920,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorisation with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +949,26 @@ namespace llvm { /// different operations. 
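The doc comment on the new getSmallBestKnownTC helper describes a strict preference order; here is a minimal standalone rendering of that chain, with std::optional in place of llvm::Optional and the three queries abstracted as callables (names are illustrative only):

    #include <functional>
    #include <optional>

    std::optional<unsigned>
    smallBestKnownTC(std::function<unsigned()> ExactTC,  // returns 0 if unknown
                     std::function<std::optional<unsigned>()> ProfileTC,
                     std::function<unsigned()> MaxTC,    // returns 0 if unknown
                     bool UseProfile) {
      if (unsigned TC = ExactTC())
        return TC;                     // 1) exact trip count
      if (UseProfile)
        if (auto TC = ProfileTC())
          return TC;                   // 2) profile-based estimate
      if (unsigned TC = MaxTC())
        return TC;                     // 3) known upper bound
      return std::nullopt;             // 4) nothing known
    }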
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +991,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1006,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,14 +1190,16 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. 
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation @@ -1157,11 +1269,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1302,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1361,6 @@ private: /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. - OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1376,13 @@ private: SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. - bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1602,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); @@ -2253,12 +2359,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or @@ -2322,8 +2427,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2334,10 +2439,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, } auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, + Alignment.value(), Mask[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2352,18 +2458,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, Ptr); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2615,8 +2721,9 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); + assert(!BB->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + // Create a new block containing the stride check. 
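The alignment handling above replaces the old "0 means ABI alignment" convention with an explicit optional that is resolved once, up front. A sketch of the fallback, with std::optional standing in for llvm::MaybeAlign:

    #include <cstdint>
    #include <optional>

    using Align = uint64_t; // power-of-two byte alignment

    // Stand-in for the value-or-ABI fallback used here: prefer the alignment
    // recorded on the instruction, else the type's ABI alignment.
    Align valueOrABITypeAlignment(std::optional<Align> InstAlign,
                                  Align ABITypeAlign) {
      return InstAlign ? *InstAlign : ABITypeAlign;
    }
    // e.g. valueOrABITypeAlignment(std::nullopt, 4) == 4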
BB->setName("vector.scevcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2649,7 +2756,20 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); + if (BB->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + // Create a new block containing the memory check. BB->setName("vector.memcheck"); auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); @@ -2666,7 +2786,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, + LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -3598,6 +3718,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa<SelectInst>(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -4064,7 +4204,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); + auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4237,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); + auto *CI = cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. @@ -4421,9 +4561,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } + const MaybeAlign Alignment = getLoadStoreAlignment(I); return isa<LoadInst>(I) ? 
- !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4593,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4607,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4675,82 +4817,96 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz", + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime SCEV checks needed. 
Enable vectorization of this " - "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz", + "runtime SCEV checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. if (!Legal->getLAI()->getSymbolicStrides().empty()) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime stride == 1 checks needed. Enable vectorization of " - "this loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os/-Oz"); - LLVM_DEBUG( - dbgs() - << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + reportVectorizationFailure("Runtime stride check is required with -Os/-Oz", + "runtime stride == 1 checks needed. Enable vectorization of " + "this loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os/-Oz", + "CantVersionLoopWithOptForSize", ORE, TheLoop); + return true; + } + + return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + reportVectorizationFailure( + "Not inserting runtime ptr check for divergent target", + "runtime pointer checks needed. Not enabled for divergent target", + "CantVersionLoopWithDivergentTarget", ORE, TheLoop); return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { - ORE->emit(createMissedAnalysis("SingleIterationLoop") - << "loop trip count is one, irrelevant for vectorization"); - LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); + reportVectorizationFailure("Single iteration (non) loop", + "loop trip count is one, irrelevant for vectorization", + "SingleIterationLoop", ORE, TheLoop); return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } - IsScalarEpilogueAllowed = !OptForSize; + // Now try the tail folding - // We don't create an epilogue when optimizing for size. 
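The new ScalarEpilogueLowering switch above folds what used to be a single OptForSize flag into four explicit states. A condensed sketch of the decision it encodes, with illustrative names (the real code also invalidates interleave groups and then attempts tail folding):

    #include <optional>

    enum class ScalarEpilogueLowering {
      Allowed,
      NotAllowedOptSize,
      NotAllowedLowTripLoop,
      NotNeededUsePredicate,
    };

    // true -> keep a scalar remainder loop; false -> try to fold the tail;
    // std::nullopt -> abandon vectorization up front.
    std::optional<bool> chooseEpiloguePath(ScalarEpilogueLowering S,
                                           bool RuntimeChecksRequired) {
      switch (S) {
      case ScalarEpilogueLowering::Allowed:
        return true;                   // plain epilogue path
      case ScalarEpilogueLowering::NotNeededUsePredicate:
        return false;                  // hint: predicate the vector body
      case ScalarEpilogueLowering::NotAllowedLowTripLoop:
      case ScalarEpilogueLowering::NotAllowedOptSize:
        if (RuntimeChecksRequired)     // checks bloat code size: give up
          return std::nullopt;
        return false;                  // otherwise try folding the tail
      }
      return std::nullopt;             // unreachable; keeps compilers happy
    }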
// Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4915,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. " - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. " + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +4976,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; @@ -4832,9 +4990,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. 
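The VF-selection loop that follows now checks a per-register-class budget instead of a single register count. A sketch of that shape in plain C++, where NumRegs abstracts the target's per-class register count (all names illustrative):

    #include <map>
    #include <vector>

    using RegClass = unsigned;
    using RegUsageMap = std::map<RegClass, unsigned>;

    // Pick the largest candidate VF whose peak usage fits the budget for
    // every register class; fall back to the default otherwise.
    unsigned selectMaxVF(const std::vector<unsigned> &VFs,
                         const std::vector<RegUsageMap> &Usage,
                         unsigned (*NumRegs)(RegClass), unsigned Fallback) {
      for (int i = static_cast<int>(VFs.size()) - 1; i >= 0; --i) {
        bool Fits = true;
        for (const auto &P : Usage[i])
          if (P.second > NumRegs(P.first))
            Fits = false;
        if (Fits)
          return VFs[i];
      }
      return Fallback;
    }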
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5049,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5120,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,8 +5136,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. @@ -4988,22 +5148,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5166,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. 
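The interleave-count formula being generalized here is now evaluated once per register class and the minimum is taken. A worked sketch of the per-class bound, assuming MaxLocalUsers has already been clamped to at least 1 as in the patch:

    #include <algorithm>

    unsigned powerOf2Floor(unsigned X) {
      unsigned R = 1;
      while (R * 2 <= X && R * 2 != 0)
        R *= 2;
      return X ? R : 0;
    }

    // With the induction-variable heuristic on, one register and one local
    // user are reserved for the induction variable.
    unsigned classIC(unsigned TargetNumRegisters, unsigned LoopInvariantRegs,
                     unsigned MaxLocalUsers, bool IndVarHeur) {
      if (IndVarHeur)
        return powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                             std::max(1u, MaxLocalUsers - 1));
      return powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                           MaxLocalUsers);
    }
    // e.g. 32 registers, 2 invariants, 5 local users, heuristic on:
    // floor((32 - 2 - 1) / 4) = 7, rounded down to IC = 4.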
- if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5208,14 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If the trip count is constant, limit the interleave count to be less than + // the trip count divided by VF. + if (TC > 0) { + assert(TC >= VF && "VF exceeds trip count?"); + if ((TC / VF) < MaxInterleaveCount) + MaxInterleaveCount = (TC / VF); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -5044,7 +5224,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5376,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector<RegisterUsage, 8> RUs(VFs.size()); - SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); + SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5406,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector<unsigned, unsigned, 4> RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. 
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5455,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector<unsigned, unsigned, 4> Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5731,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5744,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. 
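The register-usage bookkeeping above is one small pattern repeated for RegUsage, MaxUsages and Invariant: accumulate counts keyed by register class, then merge with a running per-class maximum. A sketch of just that pattern, with std::map standing in for SmallMapVector:

#include <algorithm>
#include <map>

using ClassUsage = std::map<unsigned, unsigned>; // register class ID -> count

static void addUsage(ClassUsage &U, unsigned ClassID, unsigned N) {
  U[ClassID] += N; // operator[] value-initializes the count on first use
}

static void mergeMax(ClassUsage &Max, const ClassUsage &U) {
  for (const auto &P : U)
    Max[P.first] = std::max(Max[P.first], P.second);
}

Keeping the classes separate is the point of the change: a value that is scalar after vectorization is charged to the scalar register class even when VF > 1, which the old single-counter scheme could not express.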
- Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment ? Alignment->value() : 0, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,18 +5771,20 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? Alignment->value() : 0, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5575,33 +5796,37 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, + Alignment ? Alignment->value() : 0, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + TTI.getMemoryOpCost(Instruction::Store, ValTy, + Alignment ? Alignment->value() : 0, AS) + + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5851,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. 
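Every cost call in the hunks above bridges the new MaybeAlign type back to the older unsigned-based TTI hooks with the same expression. A sketch of that idiom, using std::optional as a stand-in for llvm::MaybeAlign:

#include <cstdint>
#include <optional>

struct Align {
  uint64_t V;
  uint64_t value() const { return V; }
};
using MaybeAlign = std::optional<Align>; // empty means "alignment unknown"

// The legacy interfaces encode "unknown" as 0, so the bridge is one conditional.
static uint64_t toLegacyAlign(const MaybeAlign &A) {
  return A ? A->value() : 0;
}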
- bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,11 +5873,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, + Alignment ? Alignment->value() : 0, AS, I); } return getWideningCost(I, VF); } @@ -6167,8 +6393,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6207,10 +6432,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6840,8 +7064,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6873,7 +7104,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique<VPlan>(VPBB); + auto Plan = std::make_unique<VPlan>(VPBB); VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. @@ -6968,6 +7199,18 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7236,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique<VPlan>(); + auto Plan = std::make_unique<VPlan>(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7199,6 +7442,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } +static ScalarEpilogueLowering +getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && + (F->hasOptSize() || + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (PreferPredicateOverEpilog || Hints.getPredicate()) + SEL = CM_ScalarEpilogueNotNeededUsePredicate; + + return SEL; +} + // Process the loop in the VPlan-native vectorization path. This path builds // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7213,7 +7470,9 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an @@ -7223,15 +7482,8 @@ static bool processLoopInVPlanNativePath( // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7562,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. 
- bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
 
 // Entrance to the VPlan-native vectorization path. Outer loops are processed
 // here. They may require CFG and instruction level transformations before
@@ -7325,36 +7574,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 ORE, BFI, PSI, Hints);
 
 assert(L->empty() && "Inner loop expected.");
+
 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
 // count by optimizing for size, to minimize overheads.
- // Prefer constant trip counts over profile data, over upper bound estimate.
- unsigned ExpectedTC = 0;
- bool HasExpectedTC = false;
- if (const SCEVConstant *ConstExits =
- dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
- const APInt &ExitsCount = ConstExits->getAPInt();
- // We are interested in small values for ExpectedTC. Skip over those that
- // can't fit an unsigned.
- if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
- ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
- HasExpectedTC = true;
- }
- }
- // ExpectedTC may be large because it's bound by a variable. Check
- // profiling information to validate we should vectorize.
- if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
- auto EstimatedTC = getLoopEstimatedTripCount(L);
- if (EstimatedTC) {
- ExpectedTC = *EstimatedTC;
- HasExpectedTC = true;
- }
- }
- if (!HasExpectedTC) {
- ExpectedTC = SE->getSmallConstantMaxTripCount(L);
- HasExpectedTC = (ExpectedTC > 0);
- }
-
- if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred.");
@@ -7362,10 +7586,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
 else {
 LLVM_DEBUG(dbgs() << "\n");
- // Loops with a very small trip count are considered for vectorization
- // under OptForSize, thereby making sure the cost of their loop body is
- // dominant, free of runtime guards and scalar iteration overheads.
- OptForSize = true;
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
 }
 }
 
@@ -7374,11 +7595,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 // an integer loop and the vector instructions selected are purely integer
 // vector instructions?
 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
- "attribute is used.\n");
- ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NoImplicitFloat", L)
- << "loop not vectorized due to NoImplicitFloat attribute");
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
 Hints.emitRemarkWithHints();
 return false;
 }
@@ -7389,11 +7609,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 // additional fp-math flags can help.
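The block removed above is effectively the specification for the new getSmallBestKnownTC helper: prefer an exact constant trip count, then a profile-based estimate, then SCEV's constant upper bound. A sketch of that selection order; the three parameters stand in for the SCEV and profile queries, whose bodies are not shown in this hunk:

#include <optional>

static std::optional<unsigned>
smallBestKnownTC(std::optional<unsigned> ConstantTC, // exact trip count
                 std::optional<unsigned> ProfileTC,  // estimate from profile data
                 unsigned MaxTC) {                   // SCEV upper bound, 0 if none
  if (ConstantTC)
    return ConstantTC;
  if (ProfileTC)
    return ProfileTC;
  if (MaxTC > 0)
    return MaxTC;
  return std::nullopt;
}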
if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,8 +7630,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. @@ -7422,7 +7641,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7650,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7828,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..974eff9974d9 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -194,10 +194,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa<Constant>(i)) + if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) return false; return true; } @@ -486,6 +489,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector<Value *, 8>; @@ -614,6 +618,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. 
+ bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -1117,6 +1130,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef<Value *> AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1174,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -1220,27 +1240,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. + void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast<Instruction>(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast<Instruction>(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1279,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1289,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. 
+ bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast<Instruction>(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. LLVM_DUMP_METHOD void dump() const { @@ -1269,6 +1354,8 @@ private: for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "MainOp: " << *MainOp << "\n"; + dbgs() << "AltOp: " << *AltOp << "\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) dbgs() << *VectorizedValue; @@ -1279,12 +1366,12 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,11 +1382,13 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -1307,11 +1396,22 @@ private: Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. 
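The MainOp/AltOp accessors above encode the "alternate shuffle" idea: a bundle may mix exactly two opcodes, and any other opcode disqualifies a lane. A toy illustration of the classification, with a plain enum standing in for LLVM instructions:

enum class Op { Add, Sub, Mul };

struct TinyEntry {
  Op Main, Alt;
  bool isAltShuffle() const { return Main != Alt; }
  bool isOpcodeOrAlt(Op O) const { return O == Main || O == Alt; }
};

// A bundle like {Add, Sub, Add, Sub} gets Main = Add and Alt = Sub: it is
// vectorized as one vector add, one vector sub, and a shuffle that selects
// the even lanes from the add and the odd lanes from the sub.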
+ unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1419,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1439,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap<Value*, int> ScalarToTreeEntry; + SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1507,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector<unique_value, 8> DeletedInstructions; + DenseMap<Instruction *, bool> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1551,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1659,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1633,10 +1739,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast<Instruction>(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +1756,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. 
+ if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast<Instruction>(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +1820,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional<ScheduleData *> + tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1945,6 +2071,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast<Instruction>(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -2026,28 +2176,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2205,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2219,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2227,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast<Instruction>(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2246,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2263,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2277,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2293,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique<BlockScheduling>(BB); + BSRef = std::make_unique<BlockScheduling>(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2313,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2325,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector<ValueList, 2> OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
for (Value *j : VL) Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2357,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2379,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2407,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2421,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2289,15 +2452,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2472,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2489,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2521,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2384,7 +2556,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,7 +2582,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -2417,11 +2591,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2611,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast<Instruction>(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,58 +2624,64 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. 
Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast<Instruction>(VL[j])->getOperand(1); + for (Value *V : VL) { + auto Op = cast<Instruction>(V)->getOperand(1); if (!isa<ConstantInt>(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. + // Check if the stores are consecutive or if we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(0)); - + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(0)); + TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2509,7 +2693,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +2704,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +2723,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +2738,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast<CallInst>(j); + for (Value *V : VL) { + auto *CI2 = cast<CallInst>(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +2765,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,7 +2798,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2738,7 +2934,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +2957,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2851,7 +3046,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3059,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. 
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3067,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,12 +3135,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, + int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3027,11 +3222,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3241,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast<Instruction>(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3098,6 +3293,43 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast<Instruction>(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3319,16 +3551,16 @@ void BoUpSLP::reorderInputsAccordingToOpcode( Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. 
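The isLoadCombineReductionCandidate hook added above declines to vectorize an 'or' reduction whose leaves are zero-extended, optionally shifted loads, since the backend's load combining already folds that pattern into one wide load. A representative source pattern, written here as an illustrative example rather than taken from the patch:

    #include <cstdint>

    // On a target where i32 is a legal type, the backend folds this whole
    // 'or' reduction of shifted byte loads into a single 32-bit load, so
    // SLP-vectorizing the reduction would be a pessimization.
    uint32_t readLE32(const uint8_t *P) {
      return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
             ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
    }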
- auto *Front = cast<Instruction>(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3571,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3597,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); + SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3618,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +3727,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3504,7 +3736,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +3750,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3577,7 +3809,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3612,7 +3844,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +3869,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +3878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast<CastInst>(VL0); + auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +3890,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +3902,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +3917,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +3938,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +3948,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +3980,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +3991,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, 
E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4009,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast<Instruction>(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4028,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3824,7 +4054,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); @@ -3840,7 +4070,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,7 +4081,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); @@ -3878,13 +4108,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4156,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + 
setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ -3949,16 +4179,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3969,8 +4199,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4136,20 +4366,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } @@ -4165,7 +4393,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4447,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -4245,11 +4475,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. 
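The function that comment describes changes shape in the next hunk: tryScheduleBundle now returns Optional<ScheduleData *> instead of bool, so buildTree_rec can pass the bundle straight into newTreeEntry. A self-contained sketch of the three-way result, with std::optional standing in for llvm::Optional and illustrative names:

    #include <optional>

    struct ScheduleData;

    // nullopt  => scheduling failed, the caller gathers instead;
    // nullptr  => nothing to schedule (e.g. PHIs), vectorization still fine;
    // non-null => the scheduled bundle to attach to the new TreeEntry.
    std::optional<ScheduleData *>
    tryScheduleOutcome(bool IsPhi, bool BundleReady, ScheduleData *Bundle) {
      if (IsPhi)
        return nullptr;
      if (!BundleReady)
        return std::nullopt;
      return Bundle;
    }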
-bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - BoUpSLP *SLP, - const InstructionsState &S) { +Optional<BoUpSLP::ScheduleData *> +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa<PHINode>(S.OpValue)) - return true; + return nullptr; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4262,7 +4492,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // instructions of the bundle. for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return None; } for (Value *V : VL) { @@ -4308,6 +4538,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, resetSchedule(); initialFillReadyList(ReadyInsts); } + assert(Bundle && "Failed to find schedule bundle"); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); @@ -4329,9 +4560,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return None; } - return true; + return Bundle; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, @@ -4364,7 +4595,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); + ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); @@ -4977,7 +5208,7 @@ struct SLPVectorizer : public FunctionPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -5052,7 +5283,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. @@ -5100,19 +5331,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. 
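That doc comment belongs to hasValueBeenRAUWed, which the next hunk deletes outright: with scalars now erased through the vectorizer's own bookkeeping, the WeakTrackingVH snapshots give way to direct deletion queries. The replacement check (its lambda appears verbatim in the store-chain and list-vectorization hunks below; the wrapper name here is illustrative) amounts to:

    // True if an earlier vectorization round already erased any value in the
    // slice, in which case the slice must be skipped.
    static bool sliceHasDeletedValues(ArrayRef<Value *> Ops, BoUpSLP &R) {
      return llvm::any_of(Ops, [&R](Value *V) {
        auto *I = dyn_cast<Instruction>(V);
        return I && R.isDeleted(I);
      });
    }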
-static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, - ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned VecRegSize) { const unsigned ChainLen = Chain.size(); @@ -5124,20 +5342,20 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, if (!isPowerOf2_32(Sz) || VF < 2) return false; - // Keep track of values that were deleted by vectorizing in the loop below. - const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); - bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { + ArrayRef<Value *> Operands = Chain.slice(i, VF); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) + if (llvm::any_of(Operands, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); R.buildTree(Operands); if (R.isTreeTinyAndNotFullyVectorizable()) @@ -5329,12 +5547,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool CandidateFound = false; int MinCost = SLPCostThreshold; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5352,13 +5566,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional<ArrayRef<unsigned>> Order = R.bestOrder(); @@ -5571,7 +5788,7 @@ class HorizontalReduction { Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + Value *Cmp = nullptr; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, @@ -5579,23 +5796,23 @@ class HorizontalReduction { case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_Max: Cmp = Opcode == Instruction::ICmp ? 
Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } public: @@ -6203,6 +6420,8 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; + if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + break; V.computeMinimumValueSizes(); @@ -6275,6 +6494,9 @@ public: } // Update users. ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -6323,7 +6545,7 @@ private: IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost; + int ScalarReduxCost = 0; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = @@ -6429,10 +6651,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem, /// \return true if it matches. static bool findBuildAggregate(InsertValueInst *IV, SmallVectorImpl<Value *> &BuildVectorOpds) { - Value *V; do { BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - V = IV->getAggregateOperand(); + Value *V = IV->getAggregateOperand(); if (isa<UndefValue>(V)) break; IV = dyn_cast<InsertValueInst>(V); @@ -6530,18 +6751,13 @@ static bool tryToVectorizeHorReductionOrInstOperands( // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. 
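Earlier in this hunk, HorizontalReduction::createOp was restructured so that each min/max kind builds its compare and returns the select directly, leaving llvm_unreachable only for RK_None. For signed-min the emitted shape reduces to the following sketch (the other kinds differ only in the predicate, or in using FCmp for floating point):

    static Value *emitSMin(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                           const Twine &Name) {
      Value *Cmp = Builder.CreateICmpSLT(LHS, RHS); // signed less-than
      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
    }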
- SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0}); + SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast<Instruction>(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast<BinaryOperator>(Inst); auto *SI = dyn_cast<SelectInst>(Inst); if (BI || SI) { @@ -6582,8 +6798,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast<Instruction>(Op)) - if (!isa<PHINode>(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -6652,11 +6868,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null<Instruction>(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -6685,7 +6900,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -6729,9 +6944,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector<WeakVH, 8> PostProcessInstructions; + SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -6811,10 +7029,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); - // We process the getelementptr list in chunks of 16 (like we do for - // stores) to minimize compile-time. - for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { - auto Len = std::min<unsigned>(BE - BI, 16); + // Process the GEP list in chunks suitable for the target's supported + // vector size. If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + if (MaxVecRegSize < EltSize) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; + for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { + auto Len = std::min<unsigned>(BE - BI, MaxElts); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. 
Note that we use a
@@ -6824,10 +7048,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
       SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
 
       // Some of the candidates may have already been vectorized after we
-      // initially collected them. If so, the WeakTrackingVHs will have
-      // nullified the
-      // values, so remove them from the set of candidates.
-      Candidates.remove(nullptr);
+      // initially collected them. If so, they are marked as deleted, so remove
+      // them from the set of candidates.
+      Candidates.remove_if(
+          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
 
       // Remove from the set of candidates all pairs of getelementptrs with
       // constant differences. Such getelementptrs are likely not good
       // candidates for vectorization in a bottom-up phase since one can be
       // computed from the other. We also ensure all candidate getelementptr
       // indices are unique.
       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
-        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+        auto *GEPI = GEPList[I];
         if (!Candidates.count(GEPI))
           continue;
         auto *SCEVI = SE->getSCEV(GEPList[I]);
         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
-          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+          auto *GEPJ = GEPList[J];
           auto *SCEVJ = SE->getSCEV(GEPList[J]);
           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
-            Candidates.remove(GEPList[I]);
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPI);
+            Candidates.remove(GEPJ);
           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
-            Candidates.remove(GEPList[J]);
+            Candidates.remove(GEPJ);
           }
         }
       }
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 517d759d7bfc..4b80d1fb20aa 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -283,6 +283,12 @@ iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
   return getParent()->getRecipeList().erase(getIterator());
 }
 
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+  InsertPos->getParent()->getRecipeList().splice(
+      std::next(InsertPos->getIterator()), getParent()->getRecipeList(),
+      getIterator());
+}
+
 void VPInstruction::generateInstruction(VPTransformState &State,
                                         unsigned Part) {
   IRBuilder<> &Builder = State.Builder;
@@ -309,6 +315,14 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     State.set(this, V, Part);
     break;
   }
+  case Instruction::Select: {
+    Value *Cond = State.get(getOperand(0), Part);
+    Value *Op1 = State.get(getOperand(1), Part);
+    Value *Op2 = State.get(getOperand(2), Part);
+    Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -728,7 +742,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
         auto NewIGIter = Old2New.find(IG);
         if (NewIGIter == Old2New.end())
           Old2New[IG] = new InterleaveGroup<VPInstruction>(
-              IG->getFactor(), IG->isReverse(), IG->getAlignment());
+              IG->getFactor(), IG->isReverse(), Align(IG->getAlignment()));
 
         if (Inst == IG->getInsertPos())
           Old2New[IG]->setInsertPos(VPInst);
@@ -736,7 +750,8 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
         InterleaveGroupMap[VPInst] = Old2New[IG];
         InterleaveGroupMap[VPInst]->insertMember(
             VPInst, IG->getIndex(Inst),
-            IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor());
+            Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+                                  : IG->getFactor()));
       }
     } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
       visitRegion(Region, Old2New, IAI);
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 8a06412ad590..44d8a198f27e 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -615,6 +615,10 @@ public:
   /// the specified recipe.
   void insertBefore(VPRecipeBase *InsertPos);
 
+  /// Unlink this recipe from its current VPBasicBlock and insert it into
+  /// the VPBasicBlock that MovePos lives in, right after MovePos.
+  void moveAfter(VPRecipeBase *MovePos);
+
   /// This method unlinks 'this' from the containing basic block and deletes it.
   ///
   /// \returns an iterator pointing to the element after the erased one
diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
index 7ed7d21b6caa..b22d3190d654 100644
--- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
+++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp
@@ -21,7 +21,7 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes(
     LoopVectorizationLegality::InductionList *Inductions,
     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
 
-  VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry());
+  auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
   ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
 
   // Condition bit VPValues get deleted during transformation to VPRecipes.
diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp
index e5ab24e52df6..9019ed15ec5f 100644
--- a/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -346,11 +346,14 @@ SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
 
 void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
   dbgs() << " Ops: ";
-  for (auto Op : Values)
-    if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
-      dbgs() << *Instr << " | ";
-    else
-      dbgs() << " nullptr | ";
+  for (auto Op : Values) {
+    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
+      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
+        dbgs() << *Instr << " | ";
+        continue;
+      }
+    dbgs() << " nullptr | ";
+  }
   dbgs() << "\n";
 }
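The VPRecipeBase::moveAfter added above is an intrusive-list splice, so moving a recipe between VPBasicBlocks is O(1) and never copies or recreates the recipe. A runnable, self-contained analogue with std::list standing in for the recipe lists of two blocks (all names illustrative):

    #include <iostream>
    #include <list>

    int main() {
      std::list<int> BlockA = {1, 2, 3}; // source block's recipe list
      std::list<int> BlockB = {10, 20};  // destination block's recipe list

      auto Recipe = std::next(BlockA.begin()); // the recipe to move ("2")
      auto MovePos = BlockB.begin();           // insert right after "10"

      // The moveAfter idiom: splice into MovePos's list, after MovePos.
      BlockB.splice(std::next(MovePos), BlockA, Recipe);

      for (int X : BlockB)
        std::cout << X << ' '; // prints: 10 2 20
      std::cout << '\n';
    }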