author | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
commit | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree | 599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Transforms/Vectorize
parent | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Transforms/Vectorize')
21 files changed, 1856 insertions, 873 deletions
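The bulk of the LoopVectorizationLegality.cpp changes in the diff below replace paired `LLVM_DEBUG(...)` prints and `ORE->emit(createMissedAnalysis(...))` remarks with one `reportVectorizationFailure(DebugMsg, OREMsg, ORETag, I)` helper. A minimal standalone sketch of that consolidation pattern follows; the `Remark` struct and `reportFailure()` here are illustrative stand-ins compiled without LLVM, not the library's actual classes:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-in for an optimization remark as emitted via
// -Rpass-missed=loop-vectorize.
struct Remark {
  std::string Tag; // e.g. "CFGNotUnderstood"
  std::string Msg; // user-facing message
};

static std::vector<Remark> EmittedRemarks;

// One call site now produces both the -debug-only=loop-vectorize line and
// the optimization remark, mirroring reportVectorizationFailure() in the
// diff below.
static void reportFailure(const std::string &DebugMsg,
                          const std::string &OREMsg,
                          const std::string &ORETag) {
  std::cerr << "LV: Not vectorizing: " << DebugMsg << ".\n"; // debug print
  EmittedRemarks.push_back({ORETag, OREMsg});                // remark
}

int main() {
  reportFailure("Loop doesn't have a legal pre-header",
                "loop control flow is not understood by vectorizer",
                "CFGNotUnderstood");
  std::cout << "remarks emitted: " << EmittedRemarks.size() << '\n';
}
```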
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 9ff18328c219..4273080ddd91 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,9 +1,8 @@
 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -927,7 +926,7 @@ bool Vectorizer::vectorizeStoreChain(
   StoreInst *S0 = cast<StoreInst>(Chain[0]);
 
   // If the vector has an int element, default to int for the whole store.
-  Type *StoreTy;
+  Type *StoreTy = nullptr;
   for (Instruction *I : Chain) {
     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
     if (StoreTy->isIntOrIntVectorTy())
@@ -939,6 +938,7 @@ bool Vectorizer::vectorizeStoreChain(
       break;
     }
   }
+  assert(StoreTy && "Failed to find store type");
 
   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
   unsigned AS = S0->getPointerAddressSpace();
@@ -1152,13 +1152,8 @@ bool Vectorizer::vectorizeLoadChain(
            vectorizeLoadChain(Chains.second, InstructionsProcessed);
   }
 
-    unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
-                                                   StackAdjustedAlignment,
-                                                   DL, L0, nullptr, &DT);
-    if (NewAlign != 0)
-      Alignment = NewAlign;
-
-    Alignment = NewAlign;
+    Alignment = getOrEnforceKnownAlignment(
+        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
   }
 
   if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
@@ -1182,7 +1177,7 @@ bool Vectorizer::vectorizeLoadChain(
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
+  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
   propagateMetadata(LI, Chain);
 
   if (VecLoadTy) {
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index b44fe5a52a2f..6ef8dc2d3cd7 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1,9 +1,8 @@
 //===- LoopVectorizationLegality.cpp --------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -23,6 +22,8 @@ using namespace llvm;
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
 
+extern cl::opt<bool> EnableVPlanPredication;
+
 static cl::opt<bool>
     EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                        cl::desc("Enable if-conversion during vectorization."));
@@ -46,6 +47,18 @@ static const unsigned MaxInterleaveFactor = 16;
 
 namespace llvm {
 
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+                                      Instruction *I) {
+  dbgs() << "LV: Not vectorizing: " << DebugMsg;
+  if (I != nullptr)
+    dbgs() << " " << *I;
+  else
+    dbgs() << '.';
+  dbgs() << '\n';
+}
+#endif
+
 OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
@@ -103,6 +116,25 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
               << "LV: Interleaving disabled by the pass manager\n");
 }
 
+void LoopVectorizeHints::setAlreadyVectorized() {
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+  MDNode *IsVectorizedMD = MDNode::get(
+      Context,
+      {MDString::get(Context, "llvm.loop.isvectorized"),
+       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+  MDNode *LoopID = TheLoop->getLoopID();
+  MDNode *NewLoopID =
+      makePostTransformationMetadata(Context, LoopID,
+                                     {Twine(Prefix(), "vectorize.").str(),
+                                      Twine(Prefix(), "interleave.").str()},
+                                     {IsVectorizedMD});
+  TheLoop->setLoopID(NewLoopID);
+
+  // Update internal cache.
+  IsVectorized.Value = 1;
+}
+
 bool LoopVectorizeHints::allowVectorization(
     Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
   if (getForce() == LoopVectorizeHints::FK_Disabled) {
@@ -230,57 +262,6 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
   }
 }
 
-MDNode *LoopVectorizeHints::createHintMetadata(StringRef Name,
-                                               unsigned V) const {
-  LLVMContext &Context = TheLoop->getHeader()->getContext();
-  Metadata *MDs[] = {
-      MDString::get(Context, Name),
-      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
-  return MDNode::get(Context, MDs);
-}
-
-bool LoopVectorizeHints::matchesHintMetadataName(MDNode *Node,
-                                                 ArrayRef<Hint> HintTypes) {
-  MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
-  if (!Name)
-    return false;
-
-  for (auto H : HintTypes)
-    if (Name->getString().endswith(H.Name))
-      return true;
-  return false;
-}
-
-void LoopVectorizeHints::writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
-  if (HintTypes.empty())
-    return;
-
-  // Reserve the first element to LoopID (see below).
-  SmallVector<Metadata *, 4> MDs(1);
-  // If the loop already has metadata, then ignore the existing operands.
-  MDNode *LoopID = TheLoop->getLoopID();
-  if (LoopID) {
-    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
-      MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
-      // If node in update list, ignore old value.
-      if (!matchesHintMetadataName(Node, HintTypes))
-        MDs.push_back(Node);
-    }
-  }
-
-  // Now, add the missing hints.
-  for (auto H : HintTypes)
-    MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
-
-  // Replace current metadata node with new one.
-  LLVMContext &Context = TheLoop->getHeader()->getContext();
-  MDNode *NewLoopID = MDNode::get(Context, MDs);
-  // Set operand 0 to refer to the loop id itself.
-  NewLoopID->replaceOperandWith(0, NewLoopID);
-
-  TheLoop->setLoopID(NewLoopID);
-}
-
 bool LoopVectorizationRequirements::doesNotMeet(
     Function *F, Loop *L, const LoopVectorizeHints &Hints) {
   const char *PassName = Hints.vectorizeAnalysisPassName();
@@ -464,6 +445,14 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
   return LAI->isUniform(V);
 }
 
+void LoopVectorizationLegality::reportVectorizationFailure(
+    const StringRef DebugMsg, const StringRef OREMsg,
+    const StringRef ORETag, Instruction *I) const {
+  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+  ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
+                                   ORETag, TheLoop, I) << OREMsg);
+}
+
 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
   assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
   // Store the result and return it at the end instead of exiting early, in case
@@ -476,9 +465,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
     // not supported yet.
     auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
     if (!Br) {
-      LLVM_DEBUG(dbgs() << "LV: Unsupported basic block terminator.\n");
-      ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-                << "loop control flow is not understood by vectorizer");
+      reportVectorizationFailure("Unsupported basic block terminator",
+          "loop control flow is not understood by vectorizer",
+          "CFGNotUnderstood");
       if (DoExtraAnalysis)
         Result = false;
       else
@@ -488,13 +477,16 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
     // Check whether the BranchInst is a supported one. Only unconditional
     // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
-    if (Br && Br->isConditional() &&
+    // FIXME: We skip these checks when VPlan predication is enabled as we
+    // want to allow divergent branches. This whole check will be removed
+    // once VPlan predication is on by default.
+    if (!EnableVPlanPredication && Br && Br->isConditional() &&
         !TheLoop->isLoopInvariant(Br->getCondition()) &&
         !LI->isLoopHeader(Br->getSuccessor(0)) &&
         !LI->isLoopHeader(Br->getSuccessor(1))) {
-      LLVM_DEBUG(dbgs() << "LV: Unsupported conditional branch.\n");
-      ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-                << "loop control flow is not understood by vectorizer");
+      reportVectorizationFailure("Unsupported conditional branch",
+          "loop control flow is not understood by vectorizer",
+          "CFGNotUnderstood");
      if (DoExtraAnalysis)
        Result = false;
      else
@@ -506,11 +498,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
   // simple outer loops scenarios with uniform nested loops.
   if (!isUniformLoopNest(TheLoop /*loop nest*/,
                          TheLoop /*context outer loop*/)) {
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Not vectorizing: Outer loop contains divergent loops.\n");
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("Outer loop contains divergent loops",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -519,10 +509,9 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
 
   // Check whether we are able to set up outer loop induction.
   if (!setupOuterLoopInductions()) {
-    LLVM_DEBUG(
-        dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n");
-    ORE->emit(createMissedAnalysis("UnsupportedPhi")
-              << "Unsupported outer loop Phi(s)");
+    reportVectorizationFailure("Unsupported outer loop Phi(s)",
+                               "Unsupported outer loop Phi(s)",
+                               "UnsupportedPhi");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -627,9 +616,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // Check that this PHI type is allowed.
         if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
             !PhiTy->isPointerTy()) {
-          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
-                    << "loop control flow is not understood by vectorizer");
-          LLVM_DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+          reportVectorizationFailure("Found a non-int non-pointer PHI",
+              "loop control flow is not understood by vectorizer",
+              "CFGNotUnderstood");
           return false;
         }
 
@@ -647,9 +636,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
         // We only allow if-converted PHIs with exactly two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
-          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
-                    << "control flow not understood by vectorizer");
-          LLVM_DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          reportVectorizationFailure("Found an invalid PHI",
+              "loop control flow is not understood by vectorizer",
+              "CFGNotUnderstood", Phi);
          return false;
         }
 
@@ -698,10 +687,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
-        ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
-                  << "value that could not be identified as "
-                     "reduction is used outside the loop");
-        LLVM_DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
+        reportVectorizationFailure("Found an unidentified PHI",
+            "value that could not be identified as "
+            "reduction is used outside the loop",
+            "NonReductionValueUsedOutsideLoop", Phi);
         return false;
       } // end of PHI handling
 
@@ -728,31 +717,33 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be part
          // of the returned info from isFunctionVectorizable()?
-          ORE->emit(createMissedAnalysis("CantVectorizeLibcall", CI)
-                    << "library call cannot be vectorized. "
-                       "Try compiling with -fno-math-errno, -ffast-math, "
-                       "or similar flags");
+          reportVectorizationFailure("Found a non-intrinsic callsite",
+                                     "library call cannot be vectorized. "
+                                     "Try compiling with -fno-math-errno, -ffast-math, "
+                                     "or similar flags",
+                                     "CantVectorizeLibcall", CI);
         } else {
-          ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
-                    << "call instruction cannot be vectorized");
+          reportVectorizationFailure("Found a non-intrinsic callsite",
+                                     "call instruction cannot be vectorized",
+                                     "CantVectorizeLibcall", CI);
         }
-        LLVM_DEBUG(
-            dbgs() << "LV: Found a non-intrinsic callsite.\n");
         return false;
       }
 
-      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
-      // second argument is the same (i.e. loop invariant)
-      if (CI && hasVectorInstrinsicScalarOpd(
-              getVectorIntrinsicIDForCall(CI, TLI), 1)) {
+      // Some intrinsics have scalar arguments and should be same in order for
+      // them to be vectorized (i.e. loop invariant).
+      if (CI) {
         auto *SE = PSE.getSE();
-        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
-          ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
-                    << "intrinsic instruction cannot be vectorized");
-          LLVM_DEBUG(dbgs()
-                     << "LV: Found unvectorizable intrinsic " << *CI << "\n");
-          return false;
-        }
+        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+        for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+          if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+              reportVectorizationFailure("Found unvectorizable intrinsic",
+                  "intrinsic instruction cannot be vectorized",
+                  "CantVectorizeIntrinsic", CI);
+              return false;
+            }
+          }
       }
 
       // Check that the instruction return type is vectorizable.
@@ -760,9 +751,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if ((!VectorType::isValidElementType(I.getType()) &&
            !I.getType()->isVoidTy()) ||
           isa<ExtractElementInst>(I)) {
-        ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
-                  << "instruction return type cannot be vectorized");
-        LLVM_DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
+        reportVectorizationFailure("Found unvectorizable type",
+            "instruction return type cannot be vectorized",
+            "CantVectorizeInstructionReturnType", &I);
         return false;
       }
 
@@ -770,11 +761,44 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       if (auto *ST = dyn_cast<StoreInst>(&I)) {
         Type *T = ST->getValueOperand()->getType();
         if (!VectorType::isValidElementType(T)) {
-          ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
-                    << "store instruction cannot be vectorized");
+          reportVectorizationFailure("Store instruction cannot be vectorized",
+                                     "store instruction cannot be vectorized",
+                                     "CantVectorizeStore", ST);
           return false;
         }
 
+        // For nontemporal stores, check that a nontemporal vector version is
+        // supported on the target.
+        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+          // Arbitrarily try a vector of 2 elements.
+          Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of stored type");
+          unsigned Alignment = getLoadStoreAlignment(ST);
+          if (!TTI->isLegalNTStore(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal store instruction cannot be vectorized",
+                "nontemporal store instruction cannot be vectorized",
+                "CantVectorizeNontemporalStore", ST);
+            return false;
+          }
+        }
+
+      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+          // For nontemporal loads, check that a nontemporal vector version is
+          // supported on the target (arbitrarily try a vector of 2 elements).
+          Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+          assert(VecTy && "did not find vectorized version of load type");
+          unsigned Alignment = getLoadStoreAlignment(LD);
+          if (!TTI->isLegalNTLoad(VecTy, Alignment)) {
+            reportVectorizationFailure(
+                "nontemporal load instruction cannot be vectorized",
+                "nontemporal load instruction cannot be vectorized",
+                "CantVectorizeNontemporalLoad", LD);
+            return false;
+          }
+        }
+
         // FP instructions can allow unsafe algebra, thus vectorizable by
         // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
@@ -797,23 +821,27 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           AllowedExit.insert(&I);
           continue;
         }
-        ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
-                  << "value cannot be used outside the loop");
+        reportVectorizationFailure("Value cannot be used outside the loop",
+                                   "value cannot be used outside the loop",
+                                   "ValueUsedOutsideLoop", &I);
         return false;
       }
     } // next instr.
   }
 
   if (!PrimaryInduction) {
-    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     if (Inductions.empty()) {
-      ORE->emit(createMissedAnalysis("NoInductionVariable")
-                << "loop induction variable could not be identified");
+      reportVectorizationFailure("Did not find one integer induction var",
+          "loop induction variable could not be identified",
+          "NoInductionVariable");
       return false;
     } else if (!WidestIndTy) {
-      ORE->emit(createMissedAnalysis("NoIntegerInductionVariable")
-                << "integer loop induction variable could not be identified");
+      reportVectorizationFailure("Did not find one integer induction var",
+          "integer loop induction variable could not be identified",
+          "NoIntegerInductionVariable");
       return false;
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     }
   }
 
@@ -839,11 +867,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return false;
 
   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
-    ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write to a loop invariant address could not "
-                 "be vectorized");
-    LLVM_DEBUG(
-        dbgs() << "LV: Non vectorizable stores to a uniform address\n");
+    reportVectorizationFailure("Stores to a uniform address",
+        "write to a loop invariant address could not be vectorized",
+        "CantVectorizeStoreToLoopInvariantAddress");
     return false;
   }
   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
@@ -925,8 +951,9 @@ bool LoopVectorizationLegality::blockCanBePredicated(
 
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
-    ORE->emit(createMissedAnalysis("IfConversionDisabled")
-              << "if-conversion is disabled");
+    reportVectorizationFailure("If-conversion is disabled",
+                               "if-conversion is disabled",
+                               "IfConversionDisabled");
     return false;
   }
 
@@ -950,21 +977,26 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   for (BasicBlock *BB : TheLoop->blocks()) {
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator())) {
-      ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
-                << "loop contains a switch statement");
+      reportVectorizationFailure("Loop contains a switch statement",
+                                 "loop contains a switch statement",
+                                 "LoopContainsSwitch", BB->getTerminator());
       return false;
     }
 
     // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB)) {
       if (!blockCanBePredicated(BB, SafePointes)) {
-        ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                  << "control flow cannot be substituted for a select");
+        reportVectorizationFailure(
+            "Control flow cannot be substituted for a select",
+            "control flow cannot be substituted for a select",
+            "NoCFGForSelect", BB->getTerminator());
         return false;
       }
     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
-      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                << "control flow cannot be substituted for a select");
+      reportVectorizationFailure(
+          "Control flow cannot be substituted for a select",
+          "control flow cannot be substituted for a select",
+          "NoCFGForSelect", BB->getTerminator());
       return false;
     }
   }
@@ -992,9 +1024,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   // be canonicalized.
   if (!Lp->getLoopPreheader()) {
-    LLVM_DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("Loop doesn't have a legal pre-header",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1003,8 +1035,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
 
   // We must have a single backedge.
   if (Lp->getNumBackEdges() != 1) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The loop must have a single backedge",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1013,8 +1046,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
 
   // We must have a single exiting block.
   if (!Lp->getExitingBlock()) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The loop must have an exiting block",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1025,8 +1059,9 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
   // checked at the end of each iteration. With that we can assume that all
   // instructions in the loop are executed the same number of times.
   if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
-    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
-              << "loop control flow is not understood by vectorizer");
+    reportVectorizationFailure("The exiting block is not the loop latch",
+        "loop control flow is not understood by vectorizer",
+        "CFGNotUnderstood");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1087,7 +1122,9 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
 
     if (!canVectorizeOuterLoop()) {
-      LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Unsupported outer loop.\n");
+      reportVectorizationFailure("Unsupported outer loop",
+                                 "unsupported outer loop",
+                                 "UnsupportedOuterLoop");
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
       return false;
@@ -1137,10 +1174,9 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
 
   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
-    ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
-              << "Too many SCEV assumptions need to be made and checked "
-              << "at runtime");
-    LLVM_DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
+    reportVectorizationFailure("Too many SCEV checks needed",
+        "Too many SCEV assumptions need to be made and checked at runtime",
+        "TooManySCEVRunTimeChecks");
     if (DoExtraAnalysis)
       Result = false;
     else
@@ -1159,20 +1195,20 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
   if (!PrimaryInduction) {
-    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
-              << "Missing a primary induction variable in the loop, which is "
-              << "needed in order to fold tail by masking as required.");
-    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
-                      << "masking.\n");
+    reportVectorizationFailure(
+        "No primary induction, cannot fold tail by masking",
+        "Missing a primary induction variable in the loop, which is "
+        "needed in order to fold tail by masking as required.",
+        "NoPrimaryInduction");
     return false;
   }
 
   // TODO: handle reductions when tail is folded by masking.
   if (!Reductions.empty()) {
-    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
-              << "Cannot fold tail by masking in the presence of reductions.");
-    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
-                      << "masking.\n");
+    reportVectorizationFailure(
+        "Loop has reductions, cannot fold tail by masking",
+        "Cannot fold tail by masking in the presence of reductions.",
+        "ReductionFoldingTailByMasking");
     return false;
   }
 
@@ -1183,10 +1219,10 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
         continue;
-      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
-                << "Cannot fold tail by masking in the presence of live outs.");
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
-                        << "outside user for : " << *UI << '\n');
+      reportVectorizationFailure(
+          "Cannot fold tail by masking, loop has an outside user for",
+          "Cannot fold tail by masking in the presence of live outs.",
+          "LiveOutFoldingTailByMasking", UI);
       return false;
     }
   }
@@ -1198,9 +1234,10 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers)) {
-      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
-                << "control flow cannot be substituted for a select");
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      reportVectorizationFailure(
+          "Cannot fold tail by masking as required",
+          "control flow cannot be substituted for a select",
+          "NoCFGForSelect", BB->getTerminator());
      return false;
     }
   }
diff --git a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 2aa219064299..97077cce83e3 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -1,9 +1,8 @@
 //===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
@@ -172,6 +171,13 @@ struct VectorizationFactor {
   unsigned Width;
   // Cost of the loop with that width
   unsigned Cost;
+
+  // Width 1 means no vectorization, cost 0 means uncomputed cost.
+  static VectorizationFactor Disabled() { return {1, 0}; }
+
+  bool operator==(const VectorizationFactor &rhs) const {
+    return Width == rhs.Width && Cost == rhs.Cost;
+  }
 };
 
 /// Planner drives the vectorization process after having passed
@@ -192,11 +198,9 @@ class LoopVectorizationPlanner {
   /// The legality analysis.
   LoopVectorizationLegality *Legal;
 
-  /// The profitablity analysis.
+  /// The profitability analysis.
   LoopVectorizationCostModel &CM;
 
-  using VPlanPtr = std::unique_ptr<VPlan>;
-
   SmallVector<VPlanPtr, 4> VPlans;
 
   /// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -222,8 +226,9 @@ public:
                            LoopVectorizationCostModel &CM)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
 
-  /// Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor plan(bool OptForSize, unsigned UserVF);
+  /// Plan how to best vectorize, return the best VF and its cost, or None if
+  /// vectorization and interleaving should be avoided up front.
+  Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c45dee590b84..46265e3f3e13 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1,9 +1,8 @@
 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -57,8 +56,10 @@
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
+#include "VPlan.h"
 #include "VPlanHCFGBuilder.h"
 #include "VPlanHCFGTransforms.h"
+#include "VPlanPredicator.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -86,7 +87,9 @@
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -133,6 +136,7 @@
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
 #include <cassert>
@@ -256,6 +260,13 @@ cl::opt<bool> EnableVPlanNativePath(
     cl::desc("Enable VPlan-native vectorization path with "
              "support for outer loop vectorization."));
 
+// FIXME: Remove this switch once we have divergence analysis. Currently we
+// assume divergent non-backedge branches when this switch is true.
+cl::opt<bool> EnableVPlanPredication(
+    "enable-vplan-predication", cl::init(false), cl::Hidden,
+    cl::desc("Enable VPlan-native vectorization path predicator with "
+             "support for outer loop vectorization."));
+
 // This flag enables the stress testing of the VPlan H-CFG construction in the
 // VPlan-native vectorization path. It must be used in conjuction with
 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
@@ -267,6 +278,13 @@ static cl::opt<bool> VPlanBuildStressTest(
         "out right after the build (stress test the VPlan H-CFG construction "
         "in the VPlan-native vectorization path)."));
 
+cl::opt<bool> llvm::EnableLoopInterleaving(
+    "interleave-loops", cl::init(true), cl::Hidden,
+    cl::desc("Enable loop interleaving in Loop vectorization passes"));
+cl::opt<bool> llvm::EnableLoopVectorization(
+    "vectorize-loops", cl::init(true), cl::Hidden,
+    cl::desc("Run the Loop vectorization passes"));
+
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
 /// the scalar type.
@@ -311,11 +329,14 @@ static unsigned getReciprocalPredBlockProb() { return 2; }
 
 /// A helper function that adds a 'fast' flag to floating-point operations.
 static Value *addFastMathFlag(Value *V) {
-  if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setFast();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
-  }
+  if (isa<FPMathOperator>(V))
+    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+  return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+  if (isa<FPMathOperator>(V))
+    cast<Instruction>(V)->setFastMathFlags(FMF);
   return V;
 }
 
@@ -760,7 +781,7 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
   const DILocation *DIL = Inst->getDebugLoc();
   if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
       !isa<DbgInfoIntrinsic>(Inst)) {
-    auto NewDIL = DIL->cloneWithDuplicationFactor(UF * VF);
+    auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
     if (NewDIL)
       B.SetCurrentDebugLocation(NewDIL.getValue());
     else
@@ -836,7 +857,7 @@ public:
         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
 
   /// \return An upper bound for the vectorization factor, or None if
-  /// vectorization should be avoided up front.
+  /// vectorization and interleaving should be avoided up front.
   Optional<unsigned> computeMaxVF(bool OptForSize);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
@@ -1149,6 +1170,18 @@ public:
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+  /// with factor VF. Return the cost of the instruction, including
+  /// scalarization overhead if it's needed.
+  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+  /// Estimate cost of a call instruction CI if it were vectorized with factor
+  /// VF. Return the cost of the instruction, including scalarization overhead
+  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+  /// scalarized -
+  /// i.e. either vector version isn't available, or is too expensive.
+  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1201,6 +1234,10 @@ private:
   /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
+  /// Estimate the overhead of scalarizing an instruction. This is a
+  /// convenience wrapper for the type-based getScalarizationOverhead API.
+  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1295,6 +1332,30 @@ private:
 
   DecisionList WideningDecisions;
 
+  /// Returns true if \p V is expected to be vectorized and it needs to be
+  /// extracted.
+  bool needsExtract(Value *V, unsigned VF) const {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+      return false;
+
+    // Assume we can vectorize V (and hence we need extraction) if the
+    // scalars are not computed yet. This can happen, because it is called
+    // via getScalarizationOverhead from setCostBasedWideningDecision, before
+    // the scalars are collected. That should be a safe assumption in most
+    // cases, because we check if the operands have vectorizable types
+    // beforehand in LoopVectorizationLegality.
+    return Scalars.find(VF) == Scalars.end() ||
+           !isScalarAfterVectorization(I, VF);
+  };
+
+  /// Returns a range containing only operands needing to be extracted.
+  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+                                                   unsigned VF) {
+    return SmallVector<Value *, 4>(make_filter_range(
+        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -1372,12 +1433,6 @@ static bool isExplicitVecOuterLoop(Loop *OuterLp,
     return false;
   }
 
-  if (!Hints.getWidth()) {
-    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
-    Hints.emitRemarkWithHints();
-    return false;
-  }
-
   if (Hints.getInterleave() > 1) {
     // TODO: Interleave support is future work.
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
@@ -1447,12 +1502,13 @@ struct LoopVectorize : public FunctionPass {
     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 
     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
 
     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
-                        GetLAA, *ORE);
+                        GetLAA, *ORE, PSI);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1478,6 +1534,7 @@ struct LoopVectorize : public FunctionPass {
 
     AU.addPreserved<BasicAAWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
   }
 };
 
@@ -2051,7 +2108,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
       // A[i] = b;     // Member of index 0
       // A[i+2] = c;   // Member of index 2 (Current instruction)
       // Current pointer is pointed to A[i+2], adjust it to A[i].
-      NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+      NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
       if (InBounds)
         cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
 
@@ -2093,8 +2150,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                          GroupMask, UndefVec, "wide.masked.vec");
       }
       else
-        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
-                                            Group->getAlignment(), "wide.vec");
+        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
+                                            Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2239,16 +2296,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
         // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+            Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
         PartPtr->setIsInBounds(InBounds);
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+            Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
         PartPtr->setIsInBounds(InBounds);
         if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
           Mask[Part] = reverseVector(Mask[Part]);
       } else {
         PartPtr = cast<GetElementPtrInst>(
-            Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+            Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
         PartPtr->setIsInBounds(InBounds);
       }
 
@@ -2305,7 +2362,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                          UndefValue::get(DataTy),
                                          "wide.masked.load");
       else
-        NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+        NewLI =
+            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
 
       // Add metadata to the load, but setVectorValue to the reverse shuffle.
       addMetadata(NewLI, LI);
@@ -2665,7 +2723,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     assert(isa<SCEVConstant>(Step) &&
            "Expected constant step for pointer induction");
     return B.CreateGEP(
-        nullptr, StartValue,
+        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
   }
@@ -2849,26 +2907,42 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     BCResumeVal->addIncoming(EndValue, MiddleBlock);
 
     // Fix the scalar body counter (PHI node).
-    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-
     // The old induction's phi node in the scalar body needs the truncated
     // value.
     for (BasicBlock *BB : LoopBypassBlocks)
       BCResumeVal->addIncoming(II.getStartValue(), BB);
-    OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
   }
 
+  // We need the OrigLoop (scalar loop part) latch terminator to help
+  // produce correct debug info for the middle block BB instructions.
+  // The legality check stage guarantees that the loop will have a single
+  // latch.
+  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
+         "Scalar loop latch terminator isn't a branch");
+  BranchInst *ScalarLatchBr =
+      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   // If tail is to be folded, we know we don't need to run the remainder.
   Value *CmpN = Builder.getTrue();
-  if (!Cost->foldTailByMasking())
+  if (!Cost->foldTailByMasking()) {
     CmpN =
         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
-  ReplaceInstWithInst(MiddleBlock->getTerminator(),
-                      BranchInst::Create(ExitBlock, ScalarPH, CmpN));
+
+    // Here we use the same DebugLoc as the scalar loop latch branch instead
+    // of the corresponding compare because they may have ended up with
+    // different line numbers and we want to avoid awkward line stepping while
+    // debugging. Eg. if the compare has got a line number inside the loop.
+    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
+  }
+
+  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
+  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
+  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
 
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
@@ -3022,45 +3096,9 @@ static void cse(BasicBlock *BB) {
   }
 }
 
-/// Estimate the overhead of scalarizing an instruction. This is a
-/// convenience wrapper for the type-based getScalarizationOverhead API.
-static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         const TargetTransformInfo &TTI) {
-  if (VF == 1)
-    return 0;
-
-  unsigned Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) ||
-       !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
-
-  // Some targets keep addresses scalar.
-  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
-    return Cost;
-
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-  else if (!isa<StoreInst>(I) ||
-           !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-
-  return Cost;
-}
-
-// Estimate cost of a call instruction CI if it were vectorized with factor VF.
-// Return the cost of the instruction, including scalarization overhead if it's
-// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
-// i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
-                                  const TargetTransformInfo &TTI,
-                                  const TargetLibraryInfo *TLI,
-                                  bool &NeedToScalarize) {
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                                       unsigned VF,
+                                                       bool &NeedToScalarize) {
   Function *F = CI->getCalledFunction();
   StringRef FnName = CI->getCalledFunction()->getName();
   Type *ScalarRetTy = CI->getType();
@@ -3083,7 +3121,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -3102,12 +3140,8 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
   return Cost;
 }
 
-// Estimate cost of an intrinsic call instruction CI if it were vectorized with
-// factor VF. Return the cost of the instruction, including scalarization
-// overhead if it's needed.
-static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
-                                       const TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo *TLI) {
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+                                                            unsigned VF) {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
@@ -3468,7 +3502,7 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
     Start->addIncoming(Incoming, BB);
   }
 
-  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
+  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
   Phi->setName("scalar.recur");
 
   // Finally, fix users of the recurrence outside the loop. The users will need
@@ -3596,14 +3630,23 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
   // Reduce all of the unrolled parts into a single vector.
   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
-  setDebugLocFromInst(Builder, ReducedPartRdx);
+
+  // The middle block terminator has already been assigned a DebugLoc here (the
+  // OrigLoop's single latch terminator). We want the whole middle block to
+  // appear to execute on this line because: (a) it is all compiler generated,
+  // (b) these instructions are always executed after evaluating the latch
+  // conditional branch, and (c) other passes may add new predecessors which
+  // terminate on this line. This is the easiest way to ensure we don't
+  // accidentally cause an extra step back into the loop while debugging.
+  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
   for (unsigned Part = 1; Part < UF; ++Part) {
     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
       // Floating point operations had to be 'fast' to enable the reduction.
       ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
-                              ReducedPartRdx, "bin.rdx"));
+                              ReducedPartRdx, "bin.rdx"),
+          RdxDesc.getFastMathFlags());
     else
       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                       RdxPart);
@@ -3935,9 +3978,11 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
-      auto *NewGEP = GEP->isInBounds()
-                         ? Builder.CreateInBoundsGEP(Ptr, Indices)
-                         : Builder.CreateGEP(Ptr, Indices);
+      auto *NewGEP =
+          GEP->isInBounds()
+              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+                                          Indices)
+              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
              "NewGEP is not a pointer vector");
       VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
@@ -3955,6 +4000,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   case Instruction::FAdd:
   case Instruction::Sub:
   case Instruction::FSub:
+  case Instruction::FNeg:
   case Instruction::Mul:
   case Instruction::FMul:
   case Instruction::FDiv:
@@ -3965,21 +4011,22 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   case Instruction::And:
   case Instruction::Or:
  case Instruction::Xor: {
-    // Just widen binops.
-    auto *BinOp = cast<BinaryOperator>(&I);
-    setDebugLocFromInst(Builder, BinOp);
+    // Just widen unops and binops.
+    setDebugLocFromInst(Builder, &I);
 
     for (unsigned Part = 0; Part < UF; ++Part) {
-      Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
-      Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
-      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+      SmallVector<Value *, 2> Ops;
+      for (Value *Op : I.operands())
+        Ops.push_back(getOrCreateVectorValue(Op, Part));
 
-      if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
-        VecOp->copyIRFlags(BinOp);
+      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+      if (auto *VecOp = dyn_cast<Instruction>(V))
+        VecOp->copyIRFlags(&I);
 
       // Use this vector value for all users of the original instruction.
       VectorLoopValueMap.setVectorValue(&I, Part, V);
-      addMetadata(V, BinOp);
+      addMetadata(V, &I);
     }
 
     break;
@@ -4088,9 +4135,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
     // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
     assert((UseVectorIntrinsic || !NeedToScalarize) &&
            "Instruction should be scalarized elsewhere.");
 
@@ -4395,6 +4442,13 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
   auto *Group = getInterleavedAccessGroup(I);
   assert(Group && "Must have a group.");
 
+  // If the instruction's allocated size doesn't equal it's type size, it
+  // requires padding and will be scalarized.
+  auto &DL = I->getModule()->getDataLayout();
+  auto *ScalarTy = getMemInstValueType(I);
+  if (hasIrregularType(ScalarTy, DL, VF))
+    return false;
+
   // Check if masking is required.
   // A Group may need masking for one of two reasons: it resides in a block that
   // needs predication, or it was decided to use masking to deal with gaps.
@@ -4987,6 +5041,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
   if (LoopCost == 0)
     LoopCost = expectedCost(VF).first;
 
+  assert(LoopCost && "Non-zero loop cost expected");
+
   // Clamp the calculated IC to be between the 1 and the max interleave count
   // that the target allows.
   if (IC > MaxInterleaveCount)
@@ -5314,15 +5370,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
     return true;
   };
 
-  // Returns true if an operand that cannot be scalarized must be extracted
-  // from a vector. We will account for this scalarization overhead below. Note
-  // that the non-void predicated instructions are placed in their own blocks,
-  // and their return values are inserted into vectors. Thus, an extract would
-  // still be required.
-  auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
-  };
-
   // Compute the expected cost discount from scalarizing the entire expression
   // feeding the predicated instruction. We currently only consider expressions
   // that are single-use instruction chains.
@@ -5362,7 +5409,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
              "Instruction has non-scalar type");
       if (canBeScalarized(J))
         Worklist.push_back(J);
-      else if (needsExtract(J))
+      else if (needsExtract(J, VF))
         ScalarCost += TTI.getScalarizationOverhead(
                               ToVectorTy(J->getType(),VF), false, true);
     }
@@ -5484,7 +5531,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, TTI);
+  Cost += getScalarizationOverhead(I, VF);
 
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
@@ -5636,6 +5683,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                              unsigned VF) {
+
+  if (VF == 1)
+    return 0;
+
+  unsigned Cost = 0;
+  Type *RetTy = ToVectorTy(I->getType(), VF);
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
+  // Some targets support efficient element stores.
+  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+    return Cost;
+
+  // Collect operands to consider.
+  CallInst *CI = dyn_cast<CallInst>(I);
+  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return Cost + TTI.getOperandsScalarizationOverhead(
+                    filterExtractingOperands(Ops, VF), VF);
+}
+
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
@@ -5876,7 +5953,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF);
 
     // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
@@ -5916,6 +5993,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
         I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, Op2VK,
         TargetTransformInfo::OP_None, Op2VP, Operands);
   }
+  case Instruction::FNeg: {
+    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+    return N * TTI.getArithmeticInstrCost(
+                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OK_AnyValue,
+                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+                   I->getOperand(0));
+  }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -5997,16 +6082,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
-      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
     return CallCost;
   }
  default:
     // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }
 
@@ -6027,10 +6112,13 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
 
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                               bool VectorizeOnlyWhenForced) {
   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
@@ -6066,50 +6154,65 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
   }
 }
 
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+                                 LoopVectorizationCostModel &CM) {
+  unsigned WidestType;
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  return WidestVectorRegBits / WidestType;
+}
+
 VectorizationFactor
 LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
                                                 unsigned UserVF) {
-  // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  const VectorizationFactor NoVectorization = {1U, 0U};
-
+  unsigned VF = UserVF;
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
   // the vectorization pipeline.
   if (!OrigLoop->empty()) {
-    // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
-    // This won't be necessary when UserVF is not required in the VPlan-native
-    // path.
-    if (VPlanBuildStressTest && !UserVF)
-      UserVF = 4;
-
+    // If the user doesn't provide a vectorization factor, determine a
+    // reasonable one.
+    if (!UserVF) {
+      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
+      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+      // Make sure we have a VF > 1 for stress testing.
+      if (VPlanBuildStressTest && VF < 2) {
+        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
+        VF = 4;
+      }
+    }
     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-    assert(UserVF && "Expected UserVF for outer loop vectorization.");
-    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    buildVPlans(UserVF, UserVF);
+    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
+    buildVPlans(VF, VF);
 
     // For VPlan build stress testing, we bail out after VPlan construction.
     if (VPlanBuildStressTest)
-      return NoVectorization;
+      return VectorizationFactor::Disabled();
 
-    return {UserVF, 0};
+    return {VF, 0};
   }
 
   LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
-  return NoVectorization;
+  return VectorizationFactor::Disabled();
 }
 
-VectorizationFactor
-LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
                                                             unsigned UserVF) {
   assert(OrigLoop->empty() && "Inner loop expected.");
-  // Width 1 means no vectorization, cost 0 means uncomputed cost.
-  const VectorizationFactor NoVectorization = {1U, 0U};
   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
-  if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
-    return NoVectorization;
+  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
+    return None;
 
   // Invalidate interleave groups if all blocks of loop will be predicated.
   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
@@ -6129,7 +6232,7 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
     CM.selectUserVectorizationFactor(UserVF);
     buildVPlansWithVPRecipes(UserVF, UserVF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {UserVF, 0};
+    return {{UserVF, 0}};
  }
 
   unsigned MaxVF = MaybeMaxVF.getValue();
@@ -6148,7 +6251,7 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   buildVPlansWithVPRecipes(1, MaxVF);
   LLVM_DEBUG(printPlans(dbgs()));
   if (MaxVF == 1)
-    return NoVectorization;
+    return VectorizationFactor::Disabled();
 
   // Select the optimal vectorization factor.
   return CM.selectVectorizationFactor(MaxVF);
@@ -6527,6 +6630,7 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
   case Instruction::FCmp:
   case Instruction::FDiv:
   case Instruction::FMul:
+  case Instruction::FNeg:
   case Instruction::FPExt:
   case Instruction::FPToSI:
  case Instruction::FPToUI:
@@ -6582,9 +6686,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
     // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
@@ -6756,8 +6860,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
   }
 }
 
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
   // Hold a mapping from predicated instructions to their recipes, in order to
@@ -6772,7 +6875,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);
 
-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
   // Represent values that will have defs inside VPlan.
   for (Value *V : NeedDef)
     Plan->addVPValue(V);
@@ -6881,8 +6984,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   return Plan;
 }
 
-LoopVectorizationPlanner::VPlanPtr
-LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
@@ -6897,13 +6999,22 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildHierarchicalCFG();
 
+  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
+    Plan->addVF(VF);
+
+  if (EnableVPlanPredication) {
+    VPlanPredicator VPP(*Plan);
+    VPP.predicate();
+
+    // Avoid running transformation to recipes until masked code generation in
+    // VPlan-native path is in place.
+    return Plan;
+  }
+
   SmallPtrSet<Instruction *, 1> DeadInstructions;
   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
       Plan, Legal->getInductionVars(), DeadInstructions);
 
-  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
-    Plan->addVF(VF);
-
   return Plan;
 }
 
@@ -7096,7 +7207,8 @@ static bool processLoopInVPlanNativePath(
     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
-    OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
   Function *F = L->getHeader()->getParent();
@@ -7109,24 +7221,28 @@ static bool processLoopInVPlanNativePath(
   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
 
   // Get user vectorization factor.
-  unsigned UserVF = Hints.getWidth();
+  const unsigned UserVF = Hints.getWidth();
 
-  // Check the function attributes to find out if this function should be
-  // optimized for size.
+  // Check the function attributes and profiles to find out if this function
+  // should be optimized for size.
   bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      (F->hasOptSize() ||
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
 
   // Plan how to best vectorize, return the best VF and its cost.
-  VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
 
   // If we are stress testing VPlan builds, do not attempt to generate vector
-  // code.
-  if (VPlanBuildStressTest)
+  // code. Masked vector code generation support will follow soon.
+  // Also, do not attempt to vectorize if no vector code will be produced.
+  if (VPlanBuildStressTest || EnableVPlanPredication ||
+      VectorizationFactor::Disabled() == VF)
    return false;
 
   LVP.setBestPlan(VF.Width, 1);
 
-  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
+  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                          &CM);
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
@@ -7184,7 +7300,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements(*ORE);
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
+  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                 &Requirements, &Hints, DB, AC);
   if (!LVL.canVectorize(EnableVPlanNativePath)) {
     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
     Hints.emitRemarkWithHints();
    return false;
   }
 
-  // Check the function attributes to find out if this function should be
-  // optimized for size.
+  // Check the function attributes and profiles to find out if this function
+  // should be optimized for size.
   bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      (F->hasOptSize() ||
+       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here.
They may require CFG and instruction level transformations before @@ -7204,7 +7322,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // pipeline. if (!L->empty()) return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, Hints); + ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); // Check the loop for a trip count threshold: vectorize loops with a tiny trip @@ -7304,14 +7422,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - VectorizationFactor VF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); - // Select the interleave count. - unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); - - // Get user interleave count. + VectorizationFactor VF = VectorizationFactor::Disabled(); + unsigned IC = 1; unsigned UserIC = Hints.getInterleave(); + if (MaybeVF) { + VF = *MaybeVF; + // Select the interleave count. + IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + } + // Identify the diagnostic messages that should be produced. std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -7330,7 +7452,16 @@ bool LoopVectorizePass::processLoop(Loop *L) { VectorizeLoop = false; } - if (IC == 1 && UserIC <= 1) { + if (!MaybeVF && UserIC > 1) { + // Tell the user interleaving was avoided up-front, despite being explicitly + // requested. + LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " + "interleaving should be avoided up front\n"); + IntDiagMsg = std::make_pair( + "InterleavingAvoided", + "Ignoring UserIC, because interleaving was avoided up front"); + InterleaveLoop = false; + } else if (IC == 1 && UserIC <= 1) { // Tell the user interleaving is not beneficial. LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); IntDiagMsg = std::make_pair( @@ -7457,7 +7588,7 @@ bool LoopVectorizePass::runImpl( DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_, std::function<const LoopAccessInfo &(Loop &)> &GetLAA_, - OptimizationRemarkEmitter &ORE_) { + OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { SE = &SE_; LI = &LI_; TTI = &TTI_; @@ -7469,6 +7600,7 @@ bool LoopVectorizePass::runImpl( GetLAA = &GetLAA_; DB = &DB_; ORE = &ORE_; + PSI = PSI_; // Don't attempt if // 1. the target claims to have no vector registers, and @@ -7488,7 +7620,8 @@ bool LoopVectorizePass::runImpl( // will simplify all loops, regardless of whether anything end up being // vectorized. for (auto &L : *LI) - Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */); + Changed |= + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops @@ -7527,15 +7660,22 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, auto &AC = AM.getResult<AssumptionAnalysis>(F); auto &DB = AM.getResult<DemandedBitsAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + MemorySSA *MSSA = EnableMSSALoopDependency + ? 
&AM.getResult<MemorySSAAnalysis>(F).getMSSA() + : nullptr; auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); std::function<const LoopAccessInfo &(Loop &)> GetLAA = [&](Loop &L) -> const LoopAccessInfo & { - LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr}; + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA}; return LAM.getResult<LoopAccessAnalysis>(L, AR); }; + const ModuleAnalysisManager &MAM = + AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); bool Changed = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE); + runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI); if (!Changed) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2e856a7e6802..27a86c0bca91 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1,9 +1,8 @@ //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -106,6 +105,10 @@ using namespace slpvectorizer; STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); +cl::opt<bool> + llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden, + cl::desc("Run the SLP vectorization passes")); + static cl::opt<int> SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " @@ -207,6 +210,13 @@ static bool isSplat(ArrayRef<Value *> VL) { return true; } +/// \returns True if \p I is commutative, handles CmpInst as well as Instruction. +static bool isCommutative(Instruction *I) { + if (auto *IC = dyn_cast<CmpInst>(I)) + return IC->isCommutative(); + return I->isCommutative(); +} + /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -438,8 +448,9 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, case Instruction::Call: { CallInst *CI = cast<CallInst>(UserInst); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - return (CI->getArgOperand(1) == Scalar); + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + if (hasVectorInstrinsicScalarOpd(ID, i)) + return (CI->getArgOperand(i) == Scalar); } LLVM_FALLTHROUGH; } @@ -474,6 +485,8 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { + struct TreeEntry; + public: using ValueList = SmallVector<Value *, 8>; using InstrList = SmallVector<Instruction *, 16>; @@ -517,7 +530,7 @@ public: /// \returns the cost incurred by unwanted spills and fills, caused by /// holding live values over call sites. - int getSpillCost(); + int getSpillCost() const; /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. 
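For orientation, the cost interfaces above follow SLP's sign convention: a negative tree cost means the vector form is expected to be cheaper than the scalars it replaces. A minimal sketch of how a caller chains them together, loosely modeled on the pass's own tryToVectorizeList (the helper name below is hypothetical, and error handling plus the extra buildTree parameters are omitted):

static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> VL) {
  R.buildTree(VL);                      // Grow the tree rooted at the bundle.
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;                       // Too small to amortize the overhead.
  R.computeMinimumValueSizes();         // Allow narrower vector element types.
  int Cost = R.getTreeCost();           // Spill cost is folded into the total.
  if (Cost >= -SLPCostThreshold)        // Must beat the -slp-threshold margin.
    return false;
  R.vectorizeTree();                    // Emit vector code plus extracts.
  return true;
}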
@@ -576,7 +589,7 @@ public: /// the stored value. Otherwise, the size is the width of the largest loaded /// value reaching V. This method is used by the vectorizer to calculate /// vectorization factors. - unsigned getVectorElementSize(Value *V); + unsigned getVectorElementSize(Value *V) const; /// Compute the minimum type sizes required to represent the entries in a /// vectorizable tree. @@ -599,13 +612,512 @@ public: /// \returns True if the VectorizableTree is both tiny and not fully /// vectorizable. We do not vectorize such trees. - bool isTreeTinyAndNotFullyVectorizable(); + bool isTreeTinyAndNotFullyVectorizable() const; OptimizationRemarkEmitter *getORE() { return ORE; } -private: - struct TreeEntry; + /// This structure holds any data we need about the edges being traversed + /// during buildTree_rec(). We keep track of: + /// (i) the user TreeEntry index, and + /// (ii) the index of the edge. + struct EdgeInfo { + EdgeInfo() = default; + EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) + : UserTE(UserTE), EdgeIdx(EdgeIdx) {} + /// The user TreeEntry. + TreeEntry *UserTE = nullptr; + /// The operand index of the use. + unsigned EdgeIdx = UINT_MAX; +#ifndef NDEBUG + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::EdgeInfo &EI) { + EI.dump(OS); + return OS; + } + /// Debug print. + void dump(raw_ostream &OS) const { + OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null") + << " EdgeIdx:" << EdgeIdx << "}"; + } + LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } +#endif + }; + + /// A helper data structure to hold the operands of a vector of instructions. + /// This supports a fixed vector length for all operand vectors. + class VLOperands { + /// For each operand we need (i) the value, and (ii) the opcode that it + /// would be attached to if the expression was in a left-linearized form. + /// This is required to avoid illegal operand reordering. + /// For example: + /// \verbatim + /// 0 Op1 + /// |/ + /// Op1 Op2 Linearized + Op2 + /// \ / ----------> |/ + /// - - + /// + /// Op1 - Op2 (0 + Op1) - Op2 + /// \endverbatim + /// + /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. + /// + /// Another way to think of this is to track all the operations across the + /// path from the operand all the way to the root of the tree and to + /// calculate the operation that corresponds to this path. For example, the + /// path from Op2 to the root crosses the RHS of the '-', therefore the + /// corresponding operation is a '-' (which matches the one in the + /// linearized tree, as shown above). + /// + /// For lack of a better term, we refer to this operation as Accumulated + /// Path Operation (APO). + struct OperandData { + OperandData() = default; + OperandData(Value *V, bool APO, bool IsUsed) + : V(V), APO(APO), IsUsed(IsUsed) {} + /// The operand value. + Value *V = nullptr; + /// TreeEntries only allow a single opcode, or an alternate sequence of + /// them (e.g, +, -). Therefore, we can safely use a boolean value for the + /// APO. It is set to 'true' if 'V' is attached to an inverse operation + /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise + /// (e.g., Add/Mul) + bool APO = false; + /// Helper data for the reordering function. + bool IsUsed = false; + }; + + /// During operand reordering, we are trying to select the operand at lane + /// that matches best with the operand at the neighboring lane. Our + /// selection is based on the type of value we are looking for. 
For example, + /// if the neighboring lane has a load, we need to look for a load that is + /// accessing a consecutive address. These strategies are summarized in the + /// 'ReorderingMode' enumerator. + enum class ReorderingMode { + Load, ///< Matching loads to consecutive memory addresses + Opcode, ///< Matching instructions based on opcode (same or alternate) + Constant, ///< Matching constants + Splat, ///< Matching the same instruction multiple times (broadcast) + Failed, ///< We failed to create a vectorizable group + }; + + using OperandDataVec = SmallVector<OperandData, 2>; + + /// A vector of operand vectors. + SmallVector<OperandDataVec, 4> OpsVec; + + const DataLayout &DL; + ScalarEvolution &SE; + + /// \returns the operand data at \p OpIdx and \p Lane. + OperandData &getData(unsigned OpIdx, unsigned Lane) { + return OpsVec[OpIdx][Lane]; + } + + /// \returns the operand data at \p OpIdx and \p Lane. Const version. + const OperandData &getData(unsigned OpIdx, unsigned Lane) const { + return OpsVec[OpIdx][Lane]; + } + + /// Clears the used flag for all entries. + void clearUsed() { + for (unsigned OpIdx = 0, NumOperands = getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) + OpsVec[OpIdx][Lane].IsUsed = false; + } + + /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. + void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { + std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); + } + + // Search all operands in Ops[*][Lane] for the one that best matches + // Ops[OpIdx][LastLane] and return its operand index. + // If no good match can be found, return None. + Optional<unsigned> + getBestOperand(unsigned OpIdx, int Lane, int LastLane, + ArrayRef<ReorderingMode> ReorderingModes) { + unsigned NumOperands = getNumOperands(); + + // The operand of the previous lane at OpIdx. + Value *OpLastLane = getData(OpIdx, LastLane).V; + + // Our strategy mode for OpIdx. + ReorderingMode RMode = ReorderingModes[OpIdx]; + + // The linearized opcode of the operand at OpIdx, Lane. + bool OpIdxAPO = getData(OpIdx, Lane).APO; + + const unsigned BestScore = 2; + const unsigned GoodScore = 1; + + // The best operand index and its score. + // Sometimes we have more than one option (e.g., Opcode and Undefs), so we + // are using the score to differentiate between the two. + struct BestOpData { + Optional<unsigned> Idx = None; + unsigned Score = 0; + } BestOp; + + // Iterate through all unused operands and look for the best. + for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { + // Get the operand at Idx and Lane. + OperandData &OpData = getData(Idx, Lane); + Value *Op = OpData.V; + bool OpAPO = OpData.APO; + + // Skip already selected operands. + if (OpData.IsUsed) + continue; + + // Skip if we are trying to move the operand to a position with a + // different opcode in the linearized tree form. This would break the + // semantics. + if (OpAPO != OpIdxAPO) + continue; + + // Look for an operand that matches the current mode. + switch (RMode) { + case ReorderingMode::Load: + if (isa<LoadInst>(Op)) { + // Figure out which is left and right, so that we can check for + // consecutive loads. + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ?
Op : OpLastLane; + if (isConsecutiveAccess(cast<LoadInst>(OpLeft), + cast<LoadInst>(OpRight), DL, SE)) + BestOp.Idx = Idx; + } + break; + case ReorderingMode::Opcode: + // We accept both Instructions and Undefs, but with different scores. + if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && + cast<Instruction>(Op)->getOpcode() == + cast<Instruction>(OpLastLane)->getOpcode()) || + (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || + isa<UndefValue>(Op)) { + // An instruction has a higher score than an undef. + unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Constant: + if (isa<Constant>(Op)) { + unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; + } + } + break; + case ReorderingMode::Splat: + if (Op == OpLastLane) + BestOp.Idx = Idx; + break; + case ReorderingMode::Failed: + return None; + } + } + + if (BestOp.Idx) { + getData(BestOp.Idx.getValue(), Lane).IsUsed = true; + return BestOp.Idx; + } + // If we could not find a good match return None. + return None; + } + + /// Helper for reorderOperandVecs. \Returns the lane that we should start + /// reordering from. This is the one which has the least number of operands + /// that can freely move about. + unsigned getBestLaneToStartReordering() const { + unsigned BestLane = 0; + unsigned Min = UINT_MAX; + for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; + ++Lane) { + unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); + if (NumFreeOps < Min) { + Min = NumFreeOps; + BestLane = Lane; + } + } + return BestLane; + } + + /// \Returns the maximum number of operands that are allowed to be reordered + /// for \p Lane. This is used as a heuristic for selecting the first lane to + /// start operand reordering. + unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + unsigned CntTrue = 0; + unsigned NumOperands = getNumOperands(); + // Operands with the same APO can be reordered. We therefore need to count + // how many of them we have for each APO, like this: Cnt[APO] = x. + // Since we only have two APOs, namely true and false, we can avoid using + // a map. Instead we can simply count the number of operands that + // correspond to one of them (in this case the 'true' APO), and calculate + // the other by subtracting it from the total number of operands. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) + if (getData(OpIdx, Lane).APO) + ++CntTrue; + unsigned CntFalse = NumOperands - CntTrue; + return std::max(CntTrue, CntFalse); + } + + /// Go through the instructions in VL and append their operands. + void appendOperandsOfVL(ArrayRef<Value *> VL) { + assert(!VL.empty() && "Bad VL"); + assert((empty() || VL.size() == getNumLanes()) && + "Expected same number of lanes"); + assert(isa<Instruction>(VL[0]) && "Expected instruction"); + unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); + OpsVec.resize(NumOperands); + unsigned NumLanes = VL.size(); + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + OpsVec[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + assert(isa<Instruction>(VL[Lane]) && "Expected instruction"); + // Our tree has just 3 nodes: the root and two operands. + // It is therefore trivial to get the APO. 
We only need to check the + // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or + // RHS operand. The LHS operand of both add and sub is never attached + // to an inverse operation in the linearized form, therefore its APO + // is false. The RHS's APO is true only if VL[Lane] is an inverse operation. + + // Since operand reordering is performed on groups of commutative + // operations or alternating sequences (e.g., +, -), we can safely + // tell the inverse operations by checking commutativity. + bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane])); + bool APO = (OpIdx == 0) ? false : IsInverseOperation; + OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx), + APO, false}; + } + } + } + + /// \returns the number of operands. + unsigned getNumOperands() const { return OpsVec.size(); } + + /// \returns the number of lanes. + unsigned getNumLanes() const { return OpsVec[0].size(); } + + /// \returns the operand value at \p OpIdx and \p Lane. + Value *getValue(unsigned OpIdx, unsigned Lane) const { + return getData(OpIdx, Lane).V; + } + /// \returns true if the data structure is empty. + bool empty() const { return OpsVec.empty(); } + + /// Clears the data. + void clear() { OpsVec.clear(); } + + /// \Returns true if there are enough operands identical to \p Op to fill + /// the whole vector. + /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow. + bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { + bool OpAPO = getData(OpIdx, Lane).APO; + for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { + if (Ln == Lane) + continue; + // This is set to true if we found a candidate for broadcast at Lane. + bool FoundCandidate = false; + for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { + OperandData &Data = getData(OpI, Ln); + if (Data.APO != OpAPO || Data.IsUsed) + continue; + if (Data.V == Op) { + FoundCandidate = true; + Data.IsUsed = true; + break; + } + } + if (!FoundCandidate) + return false; + } + return true; + } + + public: + /// Initialize with all the operands of the instruction vector \p RootVL. + VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, + ScalarEvolution &SE) + : DL(DL), SE(SE) { + // Append all the operands of RootVL. + appendOperandsOfVL(RootVL); + } + + /// \Returns a value vector with the operands across all lanes for the + /// operand at \p OpIdx. + ValueList getVL(unsigned OpIdx) const { + ValueList OpVL(OpsVec[OpIdx].size()); + assert(OpsVec[OpIdx].size() == getNumLanes() && + "Expected same num of lanes across all operands"); + for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) + OpVL[Lane] = OpsVec[OpIdx][Lane].V; + return OpVL; + } + + // Performs operand reordering for 2 or more operands. + // The original operands are in OrigOps[OpIdx][Lane]. + // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. + void reorder() { + unsigned NumOperands = getNumOperands(); + unsigned NumLanes = getNumLanes(); + // Each operand has its own mode. We are using this mode to help us select + // the instructions for each lane, so that they match best with the ones + // we have selected so far. + SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); + + // This is a greedy single-pass algorithm. We are going over each lane + // once and deciding on the best order right away with no back-tracking. + // However, in order to increase its effectiveness, we start with the lane + // that has operands that can move the least.
For example, given the + // following lanes: + // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd + // Lane 1 : A[1] = C[1] - B[1] // Visited 1st + // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd + // Lane 3 : A[3] = C[3] - B[3] // Visited 4th + // we will start at Lane 1, since the operands of the subtraction cannot + // be reordered. Then we will visit the rest of the lanes in a circular + // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. + + // Find the first lane that we will start our search from. + unsigned FirstLane = getBestLaneToStartReordering(); + + // Initialize the modes. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + Value *OpLane0 = getValue(OpIdx, FirstLane); + // Keep track if we have instructions with all the same opcode on one + // side. + if (isa<LoadInst>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Load; + else if (isa<Instruction>(OpLane0)) { + // Check if OpLane0 should be broadcast. + if (shouldBroadcast(OpLane0, OpIdx, FirstLane)) + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + ReorderingModes[OpIdx] = ReorderingMode::Opcode; + } + else if (isa<Constant>(OpLane0)) + ReorderingModes[OpIdx] = ReorderingMode::Constant; + else if (isa<Argument>(OpLane0)) + // Our best hope is a Splat. It may save some cost in some cases. + ReorderingModes[OpIdx] = ReorderingMode::Splat; + else + // NOTE: This should be unreachable. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + } + + // If the initial strategy fails for any of the operand indexes, then we + // perform reordering again in a second pass. This helps avoid assigning + // high priority to the failed strategy, and should improve reordering for + // the non-failed operand indexes. + for (int Pass = 0; Pass != 2; ++Pass) { + // Skip the second pass if the first pass did not fail. + bool StrategyFailed = false; + // Mark all operand data as free to use. + clearUsed(); + // We keep the original operand order for the FirstLane, so reorder the + // rest of the lanes. We are visiting the nodes in a circular fashion, + // using FirstLane as the center point and increasing the radius + // distance. + for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { + // Visit the lane on the right and then the lane on the left. + for (int Direction : {+1, -1}) { + int Lane = FirstLane + Direction * Distance; + if (Lane < 0 || Lane >= (int)NumLanes) + continue; + int LastLane = Lane - Direction; + assert(LastLane >= 0 && LastLane < (int)NumLanes && + "Out of bounds"); + // Look for a good match for each operand. + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + // Search for the operand that matches SortedOps[OpIdx][Lane-1]. + Optional<unsigned> BestIdx = + getBestOperand(OpIdx, Lane, LastLane, ReorderingModes); + // By not selecting a value, we allow the operands that follow to + // select a better matching value. We will get a non-null value in + // the next run of getBestOperand(). + if (BestIdx) { + // Swap the current operand with the one returned by + // getBestOperand(). + swap(OpIdx, BestIdx.getValue(), Lane); + } else { + // We failed to find a best operand, set mode to 'Failed'. + ReorderingModes[OpIdx] = ReorderingMode::Failed; + // Enable the second pass. + StrategyFailed = true; + } + } + } + } + // Skip second pass if the strategy did not fail. 
+ if (!StrategyFailed) + break; + } + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { + switch (RMode) { + case ReorderingMode::Load: + return "Load"; + case ReorderingMode::Opcode: + return "Opcode"; + case ReorderingMode::Constant: + return "Constant"; + case ReorderingMode::Splat: + return "Splat"; + case ReorderingMode::Failed: + return "Failed"; + } + llvm_unreachable("Unimplemented Reordering Type"); + } + + LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, + raw_ostream &OS) { + return OS << getModeStr(RMode); + } + + /// Debug print. + LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { + printMode(RMode, dbgs()); + } + + friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { + return printMode(RMode, OS); + } + + LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { + const unsigned Indent = 2; + unsigned Cnt = 0; + for (const OperandDataVec &OpDataVec : OpsVec) { + OS << "Operand " << Cnt++ << "\n"; + for (const OperandData &OpData : OpDataVec) { + OS.indent(Indent) << "{"; + if (Value *V = OpData.V) + OS << *V; + else + OS << "null"; + OS << ", APO:" << OpData.APO << "}\n"; + } + OS << "\n"; + } + return OS; + } + + /// Debug print. + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif + }; + +private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -613,7 +1125,8 @@ private: int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int); + void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, + const EdgeInfo &EI); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -631,12 +1144,12 @@ private: /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices); + int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef<Value *> VL); + int getGatherCost(ArrayRef<Value *> VL) const; /// Set the Builder insert point to one after the last instruction in /// the bundle @@ -648,22 +1161,18 @@ private: /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. - bool isFullyVectorizableTinyTree(); + bool isFullyVectorizableTinyTree() const; - /// \reorder commutative operands in alt shuffle if they result in - /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); - - /// \reorder commutative operands to get better probability of + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right); + static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE); struct TreeEntry { - TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {} + using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; + TreeEntry(VecTreeTy &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef<Value *> VL) const { @@ -696,20 +1205,103 @@ private: /// to be a pointer and needs to be able to initialize the child iterator. /// Thus we need a reference back to the container to translate the indices /// to entries. - std::vector<TreeEntry> &Container; + VecTreeTy &Container; /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. - SmallVector<int, 1> UserTreeIndices; + SmallVector<EdgeInfo, 1> UserTreeIndices; + + /// The index of this treeEntry in VectorizableTree. + int Idx = -1; + + private: + /// The operands of each instruction in each lane Operands[op_index][lane]. + /// Note: This helps avoid the replication of the code that performs the + /// reordering of operands during buildTree_rec() and vectorizeTree(). + SmallVector<ValueList, 2> Operands; + + public: + /// Set this bundle's \p OpIdx'th operand to \p OpVL. + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (Operands.size() < OpIdx + 1) + Operands.resize(OpIdx + 1); + assert(Operands[OpIdx].size() == 0 && "Already resized?"); + Operands[OpIdx].resize(Scalars.size()); + for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) + Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) + ? OpVL[ReuseShuffleIndices[Lane]] + : OpVL[Lane]; + } + + /// If there is a user TreeEntry, then set its operand. + void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, + ArrayRef<Value *> OpVL, + ArrayRef<unsigned> ReuseShuffleIndices) { + if (UserTreeIdx.UserTE) + UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, + ReuseShuffleIndices); + } + + /// \returns the \p OpIdx operand of this TreeEntry. + ValueList &getOperand(unsigned OpIdx) { + assert(OpIdx < Operands.size() && "Off bounds"); + return Operands[OpIdx]; + } + + /// \return the single \p OpIdx operand. + Value *getSingleOperand(unsigned OpIdx) const { + assert(OpIdx < Operands.size() && "Off bounds"); + assert(!Operands[OpIdx].empty() && "No operand available"); + return Operands[OpIdx][0]; + } + +#ifndef NDEBUG + /// Debug printer. 
+ LLVM_DUMP_METHOD void dump() const { + dbgs() << Idx << ".\n"; + for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { + dbgs() << "Operand " << OpI << ":\n"; + for (const Value *V : Operands[OpI]) + dbgs().indent(2) << *V << "\n"; + } + dbgs() << "Scalars: \n"; + for (Value *V : Scalars) + dbgs().indent(2) << *V << "\n"; + dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "VectorizedValue: "; + if (VectorizedValue) + dbgs() << *VectorizedValue; + else + dbgs() << "NULL"; + dbgs() << "\n"; + dbgs() << "ReuseShuffleIndices: "; + if (ReuseShuffleIndices.empty()) + dbgs() << "Empty"; + else + for (unsigned Idx : ReuseShuffleIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "ReorderIndices: "; + for (unsigned Idx : ReorderIndices) + dbgs() << Idx << ", "; + dbgs() << "\n"; + dbgs() << "UserTreeIndices: "; + for (const auto &EInfo : UserTreeIndices) + dbgs() << EInfo << ", "; + dbgs() << "\n"; + } +#endif }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx, - ArrayRef<unsigned> ReuseShuffleIndices = None, - ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.emplace_back(VectorizableTree); - int idx = VectorizableTree.size() - 1; - TreeEntry *Last = &VectorizableTree[idx]; + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + const EdgeInfo &UserTreeIdx, + ArrayRef<unsigned> ReuseShuffleIndices = None, + ArrayRef<unsigned> ReorderIndices = None) { + VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + TreeEntry *Last = VectorizableTree.back().get(); + Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), @@ -718,25 +1310,44 @@ private: if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + ScalarToTreeEntry[VL[i]] = Last->Idx; } } else { MustGather.insert(VL.begin(), VL.end()); } - if (UserTreeIdx >= 0) + if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - UserTreeIdx = idx; + + Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); + return Last; } /// -- Vectorization State -- /// Holds all of the tree entries. - std::vector<TreeEntry> VectorizableTree; + TreeEntry::VecTreeTy VectorizableTree; + +#ifndef NDEBUG + /// Debug printer. + LLVM_DUMP_METHOD void dumpVectorizableTree() const { + for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { + VectorizableTree[Id]->dump(); + dbgs() << "\n"; + } + } +#endif TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + return VectorizableTree[I->second].get(); + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return VectorizableTree[I->second].get(); return nullptr; } @@ -1246,21 +1857,25 @@ template <> struct GraphTraits<BoUpSLP *> { /// NodeRef has to be a pointer per the GraphWriter. using NodeRef = TreeEntry *; + using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; + + /// Add the VectorizableTree to the index iterator to be able to return + /// TreeEntry pointers.
struct ChildIteratorType - : public iterator_adaptor_base<ChildIteratorType, - SmallVector<int, 1>::iterator> { - std::vector<TreeEntry> &VectorizableTree; + : public iterator_adaptor_base< + ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { + ContainerTy &VectorizableTree; - ChildIteratorType(SmallVector<int, 1>::iterator W, - std::vector<TreeEntry> &VT) + ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, + ContainerTy &VT) : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} - NodeRef operator*() { return &VectorizableTree[*I]; } + NodeRef operator*() { return I->UserTE; } }; - static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; } + static NodeRef getEntryNode(BoUpSLP &R) { + return R.VectorizableTree[0].get(); + } static ChildIteratorType child_begin(NodeRef N) { return {N->UserTreeIndices.begin(), N->Container}; @@ -1272,7 +1887,19 @@ template <> struct GraphTraits<BoUpSLP *> { /// For the node iterator we just need to turn the TreeEntry iterator into a /// TreeEntry* iterator so that it dereferences to NodeRef. - using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>; + class nodes_iterator { + using ItTy = ContainerTy::iterator; + ItTy It; + + public: + nodes_iterator(const ItTy &It2) : It(It2) {} + NodeRef operator*() { return It->get(); } + nodes_iterator operator++() { + ++It; + return *this; + } + bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } + }; static nodes_iterator nodes_begin(BoUpSLP *R) { return nodes_iterator(R->VectorizableTree.begin()); @@ -1331,11 +1958,11 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots, 0, EdgeInfo()); // Collect the values that we need to extract from the tree. - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -1393,7 +2020,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, } void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, - int UserTreeIdx) { + const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); @@ -1450,6 +2077,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); + E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } @@ -1468,8 +2096,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. + // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 
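// (Those scalars belong to a horizontal reduction that the caller will match
// and rewrite itself; folding them into the tree would vectorize values the
// reduction code still expects to see as scalars.)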
for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i])) { + if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; @@ -1548,7 +2177,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +2187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1571,6 +2200,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } if (!CurrentOrder.empty()) { @@ -1588,6 +2222,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, ++StoredCurrentOrderAndNum->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); + // This is a special case, as it does not gather, but at the same time + // we are not extending buildTree_rec() towards the operands. + ValueList Op0; + Op0.assign(VL.size(), VL0->getOperand(0)); + VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); @@ -1693,7 +2332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +2341,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1710,10 +2349,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FCmp: { // Check that all of the compares have the same predicate. 
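// (With the change below, a compare whose predicate is the swap of P0 is also
// accepted; its operands are commuted when the operand lists are built, so the
// whole bundle ends up using the single predicate P0.)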
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); + CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast<CmpInst>(VL[i]); - if (Cmp->getPredicate() != P0 || + if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -1723,20 +2363,34 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + ValueList Left, Right; + if (cast<CmpInst>(VL0)->isCommutative()) { + // Commutative predicate - collect + sort operands of the instructions + // so that each side is more likely to have the same opcode. + assert(P0 == SwapP0 && "Commutative Predicate mismatch"); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + } else { + // Collect operands - commute if it uses the swapped predicate. + for (Value *V : VL) { + auto *Cmp = cast<CmpInst>(V); + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + if (Cmp->getPredicate() != P0) + std::swap(LHS, RHS); + Left.push_back(LHS); + Right.push_back(RHS); + } } + + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } case Instruction::Select: + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -1754,17 +2408,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + case Instruction::Xor: { + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1774,10 +2428,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
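// (Nested indexing here means GEPs with more than a single index operand,
// e.g. a struct-field access such as &S[i].f; bundles containing such GEPs
// are gathered rather than vectorized.)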
for (unsigned j = 0; j < VL.size(); ++j) { @@ -1815,7 +2469,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +2477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } @@ -1837,14 +2491,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } case Instruction::Call: { @@ -1860,9 +2514,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } Function *Int = CI->getCalledFunction(); - Value *A1I = nullptr; - if (hasVectorInstrinsicScalarOpd(ID, 1)) - A1I = CI->getArgOperand(1); + unsigned NumArgs = CI->getNumArgOperands(); + SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr); + for (unsigned j = 0; j != NumArgs; ++j) + if (hasVectorInstrinsicScalarOpd(ID, j)) + ScalarArgs[j] = CI->getArgOperand(j); for (unsigned i = 1, e = VL.size(); i != e; ++i) { CallInst *CI2 = dyn_cast<CallInst>(VL[i]); if (!CI2 || CI2->getCalledFunction() != Int || @@ -1874,16 +2530,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, << "\n"); return; } - // ctlz,cttz and powi are special intrinsics whose second argument - // should be same in order for them to be vectorized. - if (hasVectorInstrinsicScalarOpd(ID, 1)) { - Value *A1J = CI2->getArgOperand(1); - if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI - << " argument " << A1I << "!=" << A1J << "\n"); - return; + // Some intrinsics have scalar arguments and should be same in order for + // them to be vectorized. + for (unsigned j = 0; j != NumArgs; ++j) { + if (hasVectorInstrinsicScalarOpd(ID, j)) { + Value *A1J = CI2->getArgOperand(j); + if (ScalarArgs[j] != A1J) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI + << " argument " << ScalarArgs[j] << "!=" << A1J + << "\n"); + return; + } } } // Verify that the bundle operands are identical between the two calls. @@ -1899,7 +2558,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
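The generalized check in the Call case above drops the old assumption that only argument 1 (as in ctlz, cttz and powi) can be scalar: every argument position flagged by hasVectorInstrinsicScalarOpd (spelled that way in the tree) must hold the identical value in all lanes, since the single vector intrinsic call has only one scalar slot per such operand. Restated as a self-contained sketch (allSameScalarArgs is a hypothetical helper, not part of this patch):

static bool allSameScalarArgs(Intrinsic::ID ID, ArrayRef<Value *> VL) {
  auto *CI0 = cast<CallInst>(VL[0]);
  for (unsigned Arg = 0, E = CI0->getNumArgOperands(); Arg != E; ++Arg) {
    if (!hasVectorInstrinsicScalarOpd(ID, Arg))
      continue;                  // Vector-widened operands may differ per lane.
    for (Value *V : VL.slice(1)) // Compare every other lane against lane 0.
      if (cast<CallInst>(V)->getArgOperand(Arg) != CI0->getArgOperand(Arg))
        return false;            // E.g., llvm.powi exponents that disagree.
  }
  return true;
}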
@@ -1907,11 +2566,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CallInst *CI2 = dyn_cast<CallInst>(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } - case Instruction::ShuffleVector: + case Instruction::ShuffleVector: { // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. if (!S.isAltShuffle()) { @@ -1920,15 +2579,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + buildTree_rec(Left, Depth + 1, {TE, 0}); + buildTree_rec(Right, Depth + 1, {TE, 1}); return; } @@ -1938,10 +2597,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast<Instruction>(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(Operands, Depth + 1, {TE, i}); } return; - + } default: BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); @@ -2223,6 +2882,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } + case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -2260,7 +2920,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ConstantInt *CInt0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { const Instruction *I = cast<Instruction>(VL[i]); - ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(1)); + unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0; + ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; @@ -2413,31 +3074,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } } -bool BoUpSLP::isFullyVectorizableTinyTree() { +bool BoUpSLP::isFullyVectorizableTinyTree() const { LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) + if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0].NeedToGather && - (allConstant(VectorizableTree[1].Scalars) || - isSplat(VectorizableTree[1].Scalars))) + if (!VectorizableTree[0]->NeedToGather && + (allConstant(VectorizableTree[1]->Scalars) || + isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. 
- if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) return false; return true; } -bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { +bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -2457,19 +3118,19 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { return true; } -int BoUpSLP::getSpillCost() { +int BoUpSLP::getSpillCost() const { // Walk from the bottom of the tree to the top, tracking which values are // live. When we see a call instruction that is not part of our tree, // query TTI to see if there is a cost to keeping values live over it // (for example, if spills and fills are required). - unsigned BundleWidth = VectorizableTree.front().Scalars.size(); + unsigned BundleWidth = VectorizableTree.front()->Scalars.size(); int Cost = 0; SmallPtrSet<Instruction*, 4> LiveValues; Instruction *PrevInst = nullptr; - for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]); + for (const auto &TEPtr : VectorizableTree) { + Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]); if (!Inst) continue; @@ -2494,6 +3155,7 @@ int BoUpSLP::getSpillCost() { }); // Now find the sequence of instructions between PrevInst and Inst. + unsigned NumCalls = 0; BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), PrevInstIt = PrevInst->getIterator().getReverse(); @@ -2506,16 +3168,19 @@ int BoUpSLP::getSpillCost() { // Debug information doesn't impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && - &*PrevInstIt != PrevInst) { - SmallVector<Type*, 4> V; - for (auto *II : LiveValues) - V.push_back(VectorType::get(II->getType(), BundleWidth)); - Cost += TTI->getCostOfKeepingLiveOverCall(V); - } + &*PrevInstIt != PrevInst) + NumCalls++; ++PrevInstIt; } + if (NumCalls) { + SmallVector<Type*, 4> V; + for (auto *II : LiveValues) + V.push_back(VectorType::get(II->getType(), BundleWidth)); + Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); + } + PrevInst = Inst; } @@ -2527,10 +3192,10 @@ int BoUpSLP::getTreeCost() { LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - unsigned BundleWidth = VectorizableTree[0].Scalars.size(); + unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = VectorizableTree[I]; + TreeEntry &TE = *VectorizableTree[I].get(); // We create duplicate tree entries for gather sequences that have multiple // uses. However, we should not compute the cost of duplicate sequences. @@ -2545,10 +3210,11 @@ int BoUpSLP::getTreeCost() { // existing heuristics based on tree size may yield different results. // if (TE.NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), [TE](TreeEntry &Entry) { - return Entry.NeedToGather && Entry.isSame(TE.Scalars); - })) + std::any_of( + std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -2575,7 +3241,7 @@ int BoUpSLP::getTreeCost() { // extend the extracted value back to the original type.
Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2608,17 +3274,17 @@ int BoUpSLP::getTreeCost() { } int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet<unsigned> &ShuffledIndices) { + const DenseSet<unsigned> &ShuffledIndices) const { int Cost = 0; for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) if (!ShuffledIndices.count(i)) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } -int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { +int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) @@ -2638,221 +3304,19 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { return getGatherCost(VecTy, ShuffledElements); } -// Reorder commutative operations in alternate shuffle if the resulting vectors -// are consecutive loads. This would allow us to vectorize the tree. -// If we have something like- -// load a[0] - load b[0] -// load b[1] + load a[1] -// load a[2] - load b[2] -// load a[3] + load b[3] -// Reordering the second load b[1] load a[1] would allow us to vectorize this -// code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast<Instruction>(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); - } - - // Reorder if we have a commutative operation and consecutive access - // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - Instruction *VL1 = cast<Instruction>(VL[j]); - Instruction *VL2 = cast<Instruction>(VL[j + 1]); - if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j], Right[j]); - continue; - } else if (VL2->isCommutative() && - isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - // else unchanged - } - } - } -} - -// Return true if I should be commuted before adding it's left and right -// operands to the arrays Left and Right. 
-// -// The vectorizer is trying to either have all elements one side being -// instruction with the same opcode to enable further vectorization, or having -// a splat to lower the vectorizing cost. -static bool shouldReorderOperands( - int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left, - ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, - bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); - // If we have "SplatRight", try to see if commuting is needed to preserve it. - if (SplatRight) { - if (VRight == Right[i - 1]) - // Preserve SplatRight - return false; - if (VLeft == Right[i - 1]) { - // Commuting would preserve SplatRight, but we don't want to break - // SplatLeft either, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (SplatLeft && VLeft == Left[i - 1]) - return false; - return true; - } - } - // Symmetrically handle Right side. - if (SplatLeft) { - if (VLeft == Left[i - 1]) - // Preserve SplatLeft - return false; - if (VRight == Left[i - 1]) - return true; - } - - Instruction *ILeft = dyn_cast<Instruction>(VLeft); - Instruction *IRight = dyn_cast<Instruction>(VRight); - - // If we have "AllSameOpcodeRight", try to see if the left operands preserves - // it and not the right, in this case we want to commute. - if (AllSameOpcodeRight) { - unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); - if (IRight && RightPrevOpcode == IRight->getOpcode()) - // Do not commute, a match on the right preserves AllSameOpcodeRight - return false; - if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { - // We have a match and may want to commute, but first check if there is - // not also a match on the existing operands on the Left to preserve - // AllSameOpcodeLeft, i.e. preserve the original order if possible. - // (FIXME: why do we care?) - if (AllSameOpcodeLeft && ILeft && - cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) - return false; - return true; - } - } - // Symmetrically handle Left side. - if (AllSameOpcodeLeft) { - unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); - if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) - return false; - if (IRight && LeftPrevOpcode == IRight->getOpcode()) - return true; - } - return false; -} - -void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef<Value *> VL, - SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right) { - if (!VL.empty()) { - // Peel the first iteration out of the loop since there's nothing - // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast<Instruction>(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) - // Favor having instruction to the right. FIXME: why? - std::swap(VLeft, VRight); - Left.push_back(VLeft); - Right.push_back(VRight); - } - - // Keep track if we have instructions with all the same opcode on one side. - bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); - bool AllSameOpcodeRight = isa<Instruction>(Right[0]); - // Keep track if we have one side with all the same value (broadcast). 
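  // (The SplatLeft/SplatRight flags below start out true and are and-ed down
  // lane by lane; together with the AllSameOpcode* flags this is the
  // hand-rolled bookkeeping that the new VLOperands-based reordering replaces.)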
- bool SplatLeft = true; - bool SplatRight = true; - - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast<Instruction>(VL[i]); - assert(((I->getOpcode() == Opcode && I->isCommutative()) || - (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && - "Can only process commutative instruction"); - // Commute to favor either a splat or maximizing having the same opcodes on - // one side. - Value *VLeft; - Value *VRight; - if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, - VRight)) { - Left.push_back(VRight); - Right.push_back(VLeft); - } else { - Left.push_back(VLeft); - Right.push_back(VRight); - } - // Update Splat* and AllSameOpcode* after the insertion. - SplatRight = SplatRight && (Right[i - 1] == Right[i]); - SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); - AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && - (cast<Instruction>(Left[i - 1])->getOpcode() == - cast<Instruction>(Left[i])->getOpcode()); - AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && - (cast<Instruction>(Right[i - 1])->getOpcode() == - cast<Instruction>(Right[i])->getOpcode()); - } - - // If one operand end up being broadcast, return this operand order. - if (SplatRight || SplatLeft) +// Perform operand reordering on the instructions in VL and return the reordered +// operands in Left and Right. +void BoUpSLP::reorderInputsAccordingToOpcode( + ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, const DataLayout &DL, + ScalarEvolution &SE) { + if (VL.empty()) return; - - // Finally check if we can get longer vectorizable chain by reordering - // without breaking the good operand order detected above. - // E.g. If we have something like- - // load a[0] load b[0] - // load b[1] load a[1] - // load a[2] load b[2] - // load a[3] load b[3] - // Reordering the second load b[1] load a[1] would allow us to vectorize - // this code and we still retain AllSameOpcode property. - // FIXME: This load reordering might break AllSameOpcode in some rare cases - // such as- - // add a[0],c[0] load b[0] - // add a[1],c[2] load b[1] - // b[2] load b[2] - // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { - if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { - if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, *DL, *SE)) { - std::swap(Left[j + 1], Right[j + 1]); - continue; - } - } - } - // else unchanged - } + VLOperands Ops(VL, DL, SE); + // Reorder the operands in place. + Ops.reorder(); + Left = Ops.getVL(0); + Right = Ops.getVL(1); } void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, @@ -3082,13 +3546,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { continue; } - // Prepare the operand vector. 
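  // (This pattern of collecting per-lane operands into a ValueList is deleted
  // throughout vectorizeTree: buildTree_rec now records each bundle's operands
  // on its TreeEntry, so codegen simply does
  //   Value *Vec = vectorizeTree(E->getOperand(i));
  // instead of re-walking E->Scalars.)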
- for (Value *V : E->Scalars) - Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); - Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(E->getOperand(i)); NewPhi->addIncoming(Vec, IBB); } @@ -3099,7 +3559,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { if (!E->NeedToGather) { - Value *V = VL0->getOperand(0); + Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -3132,11 +3592,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractValue: { if (!E->NeedToGather) { - LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); + LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); + LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); Value *NewV = propagateMetadata(V, E->Scalars); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3177,13 +3637,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3202,16 +3658,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case Instruction::ICmp: { - ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast<Instruction>(V)->getOperand(0)); - RHSV.push_back(cast<Instruction>(V)->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(E->getOperand(0)); + Value *R = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3235,31 +3685,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast<Instruction>(V)->getOperand(0)); - TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); - FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); + setInsertPointAfterBundle(E->Scalars, S); + + Value *Cond = vectorizeTree(E->getOperand(0)); + Value *True = vectorizeTree(E->getOperand(1)); + Value *False = vectorizeTree(E->getOperand(2)); + + if (E->VectorizedValue) { + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); + return E->VectorizedValue; } + Value *V = Builder.CreateSelect(Cond, True, False); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); + } + E->VectorizedValue = V; + ++NumVectorInstructions; + return V; + } + case Instruction::FNeg: { setInsertPointAfterBundle(E->Scalars, S); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Op = vectorizeTree(E->getOperand(0)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << 
"SLP: Diamond merged for " << *VL0 << ".\n"); return E->VectorizedValue; } - Value *V = Builder.CreateSelect(Cond, True, False); + Value *V = Builder.CreateUnOp( + static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + propagateIRFlags(V, E->Scalars, VL0); + if (auto *I = dyn_cast<Instruction>(V)) + V = propagateMetadata(I, E->Scalars); + if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; ++NumVectorInstructions; + return V; } case Instruction::Add: @@ -3280,21 +3748,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - ValueList LHSVL, RHSVL; - if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast<Instruction>(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } - setInsertPointAfterBundle(E->Scalars, S); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(E->getOperand(0)); + Value *RHS = vectorizeTree(E->getOperand(1)); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3341,7 +3798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecPtr); + LI = Builder.CreateLoad(VecTy, VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } @@ -3367,13 +3824,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3400,20 +3853,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, S); - ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); - - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(E->getOperand(0)); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); - - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); OpVecs.push_back(OpVec); } @@ -3443,20 +3888,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; - // ctlz,cttz and powi are special intrinsics whose second argument is - // a scalar. This argument should not be vectorized. - if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { + // Some intrinsics have scalar arguments. This argument should not be + // vectorized. 
+ if (hasVectorInstrinsicScalarOpd(IID, j)) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast<CallInst>(V); - OpVL.push_back(CEI->getArgOperand(j)); - } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(E->getOperand(j)); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3485,7 +3926,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - ValueList LHSVL, RHSVL; assert(S.isAltShuffle() && ((Instruction::isBinaryOp(S.getOpcode()) && Instruction::isBinaryOp(S.getAltOpcode())) || @@ -3495,16 +3935,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *LHS, *RHS; if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + LHS = vectorizeTree(E->getOperand(0)); + RHS = vectorizeTree(E->getOperand(1)); } else { - ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast<Instruction>(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + LHS = vectorizeTree(E->getOperand(0)); } if (E->VectorizedValue) { @@ -3578,20 +4014,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(VectorizableTree[0].get()); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast<Instruction>(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); - auto BundleWidth = VectorizableTree[0].Scalars.size(); + auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = VectorType::get(MinTy, BundleWidth); auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); - VectorizableTree[0].VectorizedValue = Trunc; + VectorizableTree[0]->VectorizedValue = Trunc; } LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() @@ -3687,8 +4123,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } // For each vectorized value: - for (TreeEntry &EIdx : VectorizableTree) { - TreeEntry *Entry = &EIdx; + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. if (Entry->NeedToGather) @@ -3721,7 +4157,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Builder.ClearInsertionPoint(); - return VectorizableTree[0].VectorizedValue; + return VectorizableTree[0]->VectorizedValue; } void BoUpSLP::optimizeGatherSequence() { @@ -3767,10 +4203,10 @@ void BoUpSLP::optimizeGatherSequence() { // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. 
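  // (llvm::stable_sort from STLExtras.h is the range-based wrapper around
  // std::stable_sort; the comparator is unchanged, so the replacement below
  // is purely a readability cleanup.)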
- std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), - [this](const DomTreeNode *A, const DomTreeNode *B) { - return DT->properlyDominates(A, B); - }); + llvm::stable_sort(CSEWorkList, + [this](const DomTreeNode *A, const DomTreeNode *B) { + return DT->properlyDominates(A, B); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. TODO: We can further optimize this scan if we split the @@ -3989,7 +4425,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - UpIter++; + ++UpIter; } if (DownIter != LowerEnd) { if (&*DownIter == I) { @@ -4003,7 +4439,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, << "\n"); return true; } - DownIter++; + ++DownIter; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); @@ -4253,7 +4689,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) { +unsigned BoUpSLP::getVectorElementSize(Value *V) const { // If V is a store, just return the width of the stored value without // traversing the expression tree. This is the common case. if (auto *Store = dyn_cast<StoreInst>(V)) @@ -4390,7 +4826,7 @@ void BoUpSLP::computeMinimumValueSizes() { return; // We only attempt to truncate integer expressions. - auto &TreeRoot = VectorizableTree[0].Scalars; + auto &TreeRoot = VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); if (!TreeRootIT) return; @@ -4411,8 +4847,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Collect the scalar values of the vectorizable expression. We will use this // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. - for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto &EntryPtr : VectorizableTree) + Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. @@ -4746,38 +5182,29 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Do a quadratic search on all of the given stores in reverse order and find - // all of the pairs of stores that follow each other. - SmallVector<unsigned, 16> IndexQueue; - unsigned E = Stores.size(); - IndexQueue.resize(E - 1); - for (unsigned I = E; I > 0; --I) { - unsigned Idx = I - 1; - // If a store has multiple consecutive store candidates, search Stores - // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... - // This is because usually pairing with immediate succeeding or preceding - // candidate create the best chance to find slp vectorization opportunity. 
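  // (Worked example of this probe order with E = 5 stores and Idx = 2:
  // Offset 1 visits indices 1 and 3, Offset 2 visits 0 and 4. The rewrite
  // below folds the probe into a FindConsecutiveAccess lambda and breaks out
  // at the first consecutive pair instead of materializing an IndexQueue.)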
- unsigned Offset = 1; - unsigned Cnt = 0; - for (unsigned J = 0; J < E - 1; ++J, ++Offset) { - if (Idx >= Offset) { - IndexQueue[Cnt] = Idx - Offset; - ++Cnt; - } - if (Idx + Offset < E) { - IndexQueue[Cnt] = Idx + Offset; - ++Cnt; - } - } + auto &&FindConsecutiveAccess = + [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; - for (auto K : IndexQueue) { - if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) { Tails.insert(Stores[Idx]); Heads.insert(Stores[K]); ConsecutiveChain[Stores[K]] = Stores[Idx]; + return true; + }; + + // Do a quadratic search on all of the given stores in reverse order and find + // all of the pairs of stores that follow each other. + int E = Stores.size(); + for (int Idx = E - 1; Idx >= 0; --Idx) { + // If a store has multiple consecutive store candidates, search according + // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || + (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; - } - } } // For stores that start but don't end a link in the chain: @@ -5740,6 +6167,9 @@ public: unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); Value *VectorizedTree = nullptr; + + // FIXME: Fast-math-flags should be set based on the instructions in the + // reduction (not all of 'fast' are required). IRBuilder<> Builder(cast<Instruction>(ReductionRoot)); FastMathFlags Unsafe; Unsafe.setFast(); @@ -5929,10 +6359,14 @@ private: assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); - if (!IsPairwiseReduction) + if (!IsPairwiseReduction) { + // FIXME: The builder should use an FMF guard. It should not be hard-coded + // to 'fast'. + assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF"); return createSimpleTargetReduction( Builder, TTI, ReductionData.getOpcode(), VectorizedValue, ReductionData.getFlags(), ReductionOps.back()); + } Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { @@ -6256,7 +6690,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Sort by type. - std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); + llvm::stable_sort(Incoming, PhiTypeSorterFunc); // Try to vectorize elements base on their type. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), @@ -6297,7 +6731,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { SmallVector<WeakVH, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { // We may go through BB multiple times so skip the one we have checked. 
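  // (insert(&*it).second is false when the instruction is already in
  // VisitedInstrs, i.e. it was handled on an earlier pass over this block.)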
if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h index 15d38ac9c84c..0ca6a6b93cfd 100644 --- a/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -1,9 +1,8 @@ //===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -30,9 +29,6 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; - /// Target Transform Info. - const TargetTransformInfo *TTI; - /// The legality analysis. LoopVectorizationLegality *Legal; @@ -105,11 +101,9 @@ public: public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, VPBuilder &Builder) - : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - Builder(Builder) {} + : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {} /// Check if a recipe can be create for \p I withing the given VF \p Range. /// If a recipe can be created, it adds it to \p VPBB. diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp index 05a5400beb4e..517d759d7bfc 100644 --- a/lib/Transforms/Vectorize/VPlan.cpp +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -1,9 +1,8 @@ //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -374,10 +373,9 @@ void VPlan::execute(VPTransformState *State) { BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB; BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); assert(VectorHeaderBB && "Loop preheader does not have a single successor."); - BasicBlock *VectorLatchBB = VectorHeaderBB; // 1. Make room to generate basic-blocks inside loop body if needed. - VectorLatchBB = VectorHeaderBB->splitBasicBlock( + BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); Loop *L = State->LI->getLoopFor(VectorHeaderBB); L->addBasicBlockToLoop(VectorLatchBB, *State->LI); @@ -561,6 +559,19 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { bumpIndent(1); OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; bumpIndent(1); + + // Dump the block predicate. 
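  // (In the resulting DOT dump the predicate prints either as its defining
  // VPInstruction followed by the name of that instruction's parent block,
  // or as a plain operand when the predicate is not a VPInstruction.)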
+ const VPValue *Pred = BasicBlock->getPredicate(); + if (Pred) { + OS << " +\n" << Indent << " \"BlockPredicate: "; + if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { + PredI->printAsOperand(OS); + OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) + << ")\\l\""; + } else + Pred->printAsOperand(OS); + } + for (const VPRecipeBase &Recipe : *BasicBlock) Recipe.print(OS, Indent); diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 5c1b4a83c30e..8a06412ad590 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -1,9 +1,8 @@ //===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -353,6 +352,9 @@ private: /// Successor selector, null for zero or single successor blocks. VPValue *CondBit = nullptr; + /// Current block predicate - null if the block does not need a predicate. + VPValue *Predicate = nullptr; + /// Add \p Successor as the last successor to this block. void appendSuccessor(VPBlockBase *Successor) { assert(Successor && "Cannot add nullptr successor!"); @@ -491,6 +493,12 @@ public: void setCondBit(VPValue *CV) { CondBit = CV; } + VPValue *getPredicate() { return Predicate; } + + const VPValue *getPredicate() const { return Predicate; } + + void setPredicate(VPValue *Pred) { Predicate = Pred; } + /// Set a given VPBlockBase \p Successor as the single successor of this /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. /// This VPBlockBase must have no successors. @@ -521,6 +529,15 @@ public: appendPredecessor(Pred); } + /// Remove all the predecessor of this block. + void clearPredecessors() { Predecessors.clear(); } + + /// Remove all the successors of this block and set to null its condition bit + void clearSuccessors() { + Successors.clear(); + CondBit = nullptr; + } + /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(struct VPTransformState *State) = 0; @@ -1491,6 +1508,41 @@ public: From->removeSuccessor(To); To->removePredecessor(From); } + + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. + static bool isBackEdge(const VPBlockBase *FromBlock, + const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { + assert(FromBlock->getParent() == ToBlock->getParent() && + FromBlock->getParent() && "Must be in same region"); + const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock); + const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock); + if (!FromLoop || !ToLoop || FromLoop != ToLoop) + return false; + + // A back-edge is a branch from the loop latch to its header. + return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader(); + } + + /// Returns true if \p Block is a loop latch + static bool blockIsLoopLatch(const VPBlockBase *Block, + const VPLoopInfo *VPLInfo) { + if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block)) + return ParentVPL->isLoopLatch(Block); + + return false; + } + + /// Count and return the number of succesors of \p PredBlock excluding any + /// backedges. 
+ static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock, + VPLoopInfo *VPLI) { + unsigned Count = 0; + for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) { + if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI)) + Count++; + } + return Count; + } }; class VPInterleavedAccessInfo { diff --git a/lib/Transforms/Vectorize/VPlanDominatorTree.h b/lib/Transforms/Vectorize/VPlanDominatorTree.h index 1b81097b6d31..19f5d2c00c60 100644 --- a/lib/Transforms/Vectorize/VPlanDominatorTree.h +++ b/lib/Transforms/Vectorize/VPlanDominatorTree.h @@ -1,9 +1,8 @@ //===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 0f42694e193b..df96f67288f1 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -1,9 +1,8 @@ //===-- VPlanHCFGBuilder.cpp ----------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// @@ -64,7 +63,9 @@ private: void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB); void fixPhiNodes(); VPBasicBlock *getOrCreateVPBB(BasicBlock *BB); +#ifndef NDEBUG bool isExternalDef(Value *Val); +#endif VPValue *getOrCreateVPOperand(Value *IRVal); void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); @@ -119,6 +120,7 @@ VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) { return VPBB; } +#ifndef NDEBUG // Return true if \p Val is considered an external definition. An external // definition is either: // 1. A Value that is not an Instruction. This will be refined in the future. @@ -154,6 +156,7 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) { // Check whether Instruction definition is in loop body. return !TheLoop->contains(Inst); } +#endif // Create a new VPValue or retrieve an existing one for the Instruction's // operand \p IRVal. This function must only be used to create/retrieve VPValues diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h index 3f11dcb5164d..238ee7e6347c 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.h +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.h @@ -1,9 +1,8 @@ //===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp index 3ad7fc7e7b96..7ed7d21b6caa 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp @@ -1,9 +1,8 @@ //===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h index ae549c6871b3..79a23c33184f 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/lib/Transforms/Vectorize/VPlanHCFGTransforms.h @@ -1,9 +1,8 @@ //===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanLoopInfo.h b/lib/Transforms/Vectorize/VPlanLoopInfo.h index 5c2485fc2145..5208f2d58e2b 100644 --- a/lib/Transforms/Vectorize/VPlanLoopInfo.h +++ b/lib/Transforms/Vectorize/VPlanLoopInfo.h @@ -1,9 +1,8 @@ //===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanPredicator.cpp b/lib/Transforms/Vectorize/VPlanPredicator.cpp new file mode 100644 index 000000000000..7a80f3ff80a5 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -0,0 +1,248 @@ +//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the VPlanPredicator class which contains the public +/// interfaces to predicate and linearize the VPlan region. 
+/// +//===----------------------------------------------------------------------===// + +#include "VPlanPredicator.h" +#include "VPlan.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "VPlanPredicator" + +using namespace llvm; + +// Generate VPInstructions at the beginning of CurrBB that calculate the +// predicate being propagated from PredBB to CurrBB depending on the edge type +// between them. For example if: +// i. PredBB is controlled by predicate %BP, and +// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition +// bit value %CBV then this function will generate the following two +// VPInstructions at the start of CurrBB: +// %IntermediateVal = not %CBV +// %FinalVal = and %BP %IntermediateVal +// It returns %FinalVal. +VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, + VPBasicBlock *CurrBB) { + VPValue *CBV = PredBB->getCondBit(); + + // Set the intermediate value - this is either 'CBV', or 'not CBV' + // depending on the edge type. + EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB); + VPValue *IntermediateVal = nullptr; + switch (ET) { + case EdgeType::TRUE_EDGE: + // CurrBB is the true successor of PredBB - nothing to do here. + IntermediateVal = CBV; + break; + + case EdgeType::FALSE_EDGE: + // CurrBB is the False successor of PredBB - compute not of CBV. + IntermediateVal = Builder.createNot(CBV); + break; + } + + // Now AND intermediate value with PredBB's block predicate if it has one. + VPValue *BP = PredBB->getPredicate(); + if (BP) + return Builder.createAnd(BP, IntermediateVal); + else + return IntermediateVal; +} + +// Generate a tree of ORs for all IncomingPredicates in WorkList. +// Note: This function destroys the original Worklist. +// +// P1 P2 P3 P4 P5 +// \ / \ / / +// OR1 OR2 / +// \ | / +// \ +/-+ +// \ / | +// OR3 | +// \ | +// OR4 <- Returns this +// | +// +// The algorithm uses a worklist of predicates as its main data structure. +// We pop a pair of values from the front (e.g. P1 and P2), generate an OR +// (in this example OR1), and push it back. In this example the worklist +// contains {P3, P4, P5, OR1}. +// The process iterates until we have only one element in the Worklist (OR4). +// The last element is the root predicate which is returned. +VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { + if (Worklist.empty()) + return nullptr; + + // The worklist initially contains all the leaf nodes. Initialize the tree + // using them. + while (Worklist.size() >= 2) { + // Pop a pair of values from the front. + VPValue *LHS = Worklist.front(); + Worklist.pop_front(); + VPValue *RHS = Worklist.front(); + Worklist.pop_front(); + + // Create an OR of these values. + VPValue *Or = Builder.createOr(LHS, RHS); + + // Push OR to the back of the worklist. + Worklist.push_back(Or); + } + + assert(Worklist.size() == 1 && "Expected 1 item in worklist"); + + // The root is the last node in the worklist. + VPValue *Root = Worklist.front(); + + // This root needs to replace the existing block predicate. This is done in + // the caller function. 
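+// (Continuing the worked example: after OR2 the worklist is {P5, OR1, OR2};
+// P5 and OR1 combine into OR3, leaving {OR2, OR3}, which combine into OR4,
+// the root returned below.)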
+ return Root; +} + +// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE +VPlanPredicator::EdgeType +VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock, + VPBlockBase *ToBlock) { + unsigned Count = 0; + for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) { + if (SuccBlock == ToBlock) { + assert(Count < 2 && "Switch not supported currently"); + return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE; + } + Count++; + } + + llvm_unreachable("Broken getEdgeTypeBetween"); +} + +// Generate all predicates needed for CurrBlock by going through its immediate +// predecessor blocks. +void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock, + VPRegionBlock *Region) { + // Blocks that dominate region exit inherit the predicate from the region. + // Return after setting the predicate. + if (VPDomTree.dominates(CurrBlock, Region->getExit())) { + VPValue *RegionBP = Region->getPredicate(); + CurrBlock->setPredicate(RegionBP); + return; + } + + // Collect all incoming predicates in a worklist. + std::list<VPValue *> IncomingPredicates; + + // Set the builder's insertion point to the top of the current BB + VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock()); + Builder.setInsertPoint(CurrBB, CurrBB->begin()); + + // For each predecessor, generate the VPInstructions required for + // computing 'BP AND (not) CBV" at the top of CurrBB. + // Collect the outcome of this calculation for all predecessors + // into IncomingPredicates. + for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) { + // Skip back-edges + if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI)) + continue; + + VPValue *IncomingPredicate = nullptr; + unsigned NumPredSuccsNoBE = + VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI); + + // If there is an unconditional branch to the currBB, then we don't create + // edge predicates. We use the predecessor's block predicate instead. + if (NumPredSuccsNoBE == 1) + IncomingPredicate = PredBlock->getPredicate(); + else if (NumPredSuccsNoBE == 2) { + // Emit recipes into CurrBlock if required + assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits"); + IncomingPredicate = + getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB); + } else + llvm_unreachable("FIXME: switch statement ?"); + + if (IncomingPredicate) + IncomingPredicates.push_back(IncomingPredicate); + } + + // Logically OR all incoming predicates by building the Predicate Tree. + VPValue *Predicate = genPredicateTree(IncomingPredicates); + + // Now update the block's predicate with the new one. + CurrBlock->setPredicate(Predicate); +} + +// Generate all predicates needed for Region. +void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) { + VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry()); + ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock); + + // Generate edge predicates and append them to the block predicate. RPO is + // necessary since the predecessor blocks' block predicate needs to be set + // before the current block's block predicate can be computed. + for (VPBlockBase *Block : make_range(RPOT.begin(), RPOT.end())) { + // TODO: Handle nested regions once we start generating the same. + assert(!isa<VPRegionBlock>(Block) && "Nested region not expected"); + createOrPropagatePredicates(Block, Region); + } +} + +// Linearize the CFG within Region. +// TODO: Predication and linearization need RPOT for every region. +// This traversal is expensive. 
Since predication is not adding new +// blocks, we should be able to compute RPOT once in predication and +// reuse it here. This becomes even more important once we have nested +// regions. +void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) { + ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry()); + VPBlockBase *PrevBlock = nullptr; + + for (VPBlockBase *CurrBlock : make_range(RPOT.begin(), RPOT.end())) { + // TODO: Handle nested regions once we start generating the same. + assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected"); + + // Linearize control flow by adding an unconditional edge between PrevBlock + // and CurrBlock skipping loop headers and latches to keep intact loop + // header predecessors and loop latch successors. + if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) && + !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) { + + LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->" + << CurrBlock->getName() << "\n"); + + PrevBlock->clearSuccessors(); + CurrBlock->clearPredecessors(); + VPBlockUtils::connectBlocks(PrevBlock, CurrBlock); + } + + PrevBlock = CurrBlock; + } +} + +// Entry point. The driver function for the predicator. +void VPlanPredicator::predicate(void) { + // Predicate the blocks within Region. + predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry())); + + // Linearlize the blocks with Region. + linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry())); +} + +VPlanPredicator::VPlanPredicator(VPlan &Plan) + : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) { + // FIXME: Predicator is currently computing the dominator information for the + // top region. Once we start storing dominator information in a VPRegionBlock, + // we can avoid this recalculation. + VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry()))); +} diff --git a/lib/Transforms/Vectorize/VPlanPredicator.h b/lib/Transforms/Vectorize/VPlanPredicator.h new file mode 100644 index 000000000000..692afd2978d5 --- /dev/null +++ b/lib/Transforms/Vectorize/VPlanPredicator.h @@ -0,0 +1,74 @@ +//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines the VPlanPredicator class which contains the public +/// interfaces to predicate and linearize the VPlan region. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H + +#include "LoopVectorizationPlanner.h" +#include "VPlan.h" +#include "VPlanDominatorTree.h" + +namespace llvm { + +class VPlanPredicator { +private: + enum class EdgeType { + TRUE_EDGE, + FALSE_EDGE, + }; + + // VPlan being predicated. + VPlan &Plan; + + // VPLoopInfo for Plan's HCFG. + VPLoopInfo *VPLI; + + // Dominator tree for Plan's HCFG. + VPDominatorTree VPDomTree; + + // VPlan builder used to generate VPInstructions for block predicates. + VPBuilder Builder; + + /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if + /// \p ToBlock is either the unconditional successor or the conditional true + /// successor of \p FromBlock and FALSE_EDGE otherwise. 
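+  /// (The answer is derived purely from successor position: successor 0 of
+  /// \p FromBlock is the true edge and successor 1 the false edge; blocks
+  /// with more than two successors are asserted against.)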
+ EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock); + + /// Create and return VPValue corresponding to the predicate for the edge from + /// \p PredBB to \p CurrentBlock. + VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB); + + /// Generate and return the result of ORing all the predicate VPValues in \p + /// Worklist. + VPValue *genPredicateTree(std::list<VPValue *> &Worklist); + + /// Create or propagate predicate for \p CurrBlock in region \p Region using + /// predicate(s) of its predecessor(s) + void createOrPropagatePredicates(VPBlockBase *CurrBlock, + VPRegionBlock *Region); + + /// Predicate the CFG within \p Region. + void predicateRegionRec(VPRegionBlock *Region); + + /// Linearize the CFG within \p Region. + void linearizeRegionRec(VPRegionBlock *Region); + +public: + VPlanPredicator(VPlan &Plan); + + /// Predicate Plan's HCFG. + void predicate(void); +}; +} // end namespace llvm +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H diff --git a/lib/Transforms/Vectorize/VPlanSLP.cpp b/lib/Transforms/Vectorize/VPlanSLP.cpp index ad3a85a6f760..e5ab24e52df6 100644 --- a/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -1,9 +1,8 @@ //===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// This file implements SLP analysis based on VPlan. The analysis is based on diff --git a/lib/Transforms/Vectorize/VPlanValue.h b/lib/Transforms/Vectorize/VPlanValue.h index b473579b699f..7b6c228c229e 100644 --- a/lib/Transforms/Vectorize/VPlanValue.h +++ b/lib/Transforms/Vectorize/VPlanValue.h @@ -1,9 +1,8 @@ //===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// diff --git a/lib/Transforms/Vectorize/VPlanVerifier.cpp b/lib/Transforms/Vectorize/VPlanVerifier.cpp index 054bed4e177f..394b1b93113b 100644 --- a/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -1,9 +1,8 @@ //===-- VPlanVerifier.cpp -------------------------------------------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Transforms/Vectorize/VPlanVerifier.h b/lib/Transforms/Vectorize/VPlanVerifier.h
index d2f99d006a66..7d2b26252172 100644
--- a/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -1,9 +1,8 @@
 //===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 559ab1968844..6a4f9169c2af 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,9 +1,8 @@
 //===-- Vectorize.cpp -----------------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
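Taken together, the new VPlanPredicator files expose a small two-step driver: the constructor recalculates a dominator tree over the plan's top region, and predicate() first assigns block predicates in reverse post-order and then linearizes the region's CFG. A minimal usage sketch, assuming a populated VPlan whose entry block is a VPRegionBlock; the helper name predicateTopRegion is invented for illustration and is not part of the patch:

#include "VPlanPredicator.h"

using namespace llvm;

// Hypothetical driver: predicate and then linearize the top region of Plan.
static void predicateTopRegion(VPlan &Plan) {
  // The constructor recomputes VPDomTree for the plan's entry region.
  VPlanPredicator Predicator(Plan);
  // Computes block predicates in RPO, then linearizes the blocks.
  Predicator.predicate();
}

This mirrors the FIXME in the constructor: once dominator information is cached on VPRegionBlock, the per-instance recalculation can go away.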