author     Dimitry Andric <dim@FreeBSD.org>    2017-04-16 16:01:22 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2017-04-16 16:01:22 +0000
commit     71d5a2540a98c81f5bcaeb48805e0e2881f530ef (patch)
tree       5343938942df402b49ec7300a1c25a2d4ccd5821 /lib/Transforms/Vectorize
parent     31bbf64f3a4974a2d6c8b3b27ad2f519caf74057 (diff)

Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp            77
-rw-r--r--  lib/Transforms/Vectorize/LoadStoreVectorizer.cpp    13
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp        2965
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp         846

4 files changed, 2258 insertions, 1643 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index c01740b27d59..c83b3f7b225b 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -494,13 +494,13 @@ namespace {
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
// For stores, it is the value type, not the pointer type that matters
// because the value is what will come from a vector register.
-
+
Value *IVal = SI->getValueOperand();
T1 = IVal->getType();
} else {
T1 = I->getType();
}
-
+
if (CastInst *CI = dyn_cast<CastInst>(I))
T2 = CI->getSrcTy();
else
@@ -547,10 +547,11 @@ namespace {
// Returns the cost of the provided instruction using TTI.
// This does not handle loads and stores.
unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
- TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue) {
+ TargetTransformInfo::OK_AnyValue,
+ const Instruction *I = nullptr) {
switch (Opcode) {
default: break;
case Instruction::GetElementPtr:
@@ -584,7 +585,7 @@ namespace {
case Instruction::Select:
case Instruction::ICmp:
case Instruction::FCmp:
- return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+ return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -598,7 +599,7 @@ namespace {
case Instruction::FPTrunc:
case Instruction::BitCast:
case Instruction::ShuffleVector:
- return TTI->getCastInstrCost(Opcode, T1, T2);
+ return TTI->getCastInstrCost(Opcode, T1, T2, I);
}
return 1;
@@ -894,7 +895,7 @@ namespace {
// vectors that has a scalar condition results in a malformed select.
// FIXME: We could probably be smarter about this by rewriting the select
// with different types instead.
- return (SI->getCondition()->getType()->isVectorTy() ==
+ return (SI->getCondition()->getType()->isVectorTy() ==
SI->getTrueValue()->getType()->isVectorTy());
} else if (isa<CmpInst>(I)) {
if (!Config.VectorizeCmp)
@@ -1044,14 +1045,14 @@ namespace {
return false;
}
} else if (TTI) {
- unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
- unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
- Type *VT1 = getVecTypeForPair(IT1, JT1),
- *VT2 = getVecTypeForPair(IT2, JT2);
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
// On some targets (example X86) the cost of a vector shift may vary
// depending on whether the second operand is a Uniform or
@@ -1090,7 +1091,7 @@ namespace {
// but this cost is ignored (because insert and extract element
// instructions are assigned a zero depth factor and are not really
// fused in general).
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
if (VCost > ICost + JCost)
return false;
@@ -1127,39 +1128,51 @@ namespace {
FastMathFlags FMFCI;
if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
FMFCI = FPMOCI->getFastMathFlags();
+ SmallVector<Value *, 4> IArgs(CI->arg_operands());
+ unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CI->getArgOperand(i)->getType());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
- Tys.clear();
CallInst *CJ = cast<CallInst>(J);
FastMathFlags FMFCJ;
if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
FMFCJ = FPMOCJ->getFastMathFlags();
- for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CJ->getArgOperand(i)->getType());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
+ SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+ unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
- Tys.clear();
assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
"Intrinsic argument counts differ");
+ SmallVector<Type*, 4> Tys;
+ SmallVector<Value *, 4> VecArgs;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && i == 1)
+ IID == Intrinsic::cttz) && i == 1) {
Tys.push_back(CI->getArgOperand(i)->getType());
- else
+ VecArgs.push_back(CI->getArgOperand(i));
+ }
+ else {
Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
CJ->getArgOperand(i)->getType()));
+ // Add both operands, and then count their scalarization overhead
+ // with VF 1.
+ VecArgs.push_back(CI->getArgOperand(i));
+ VecArgs.push_back(CJ->getArgOperand(i));
+ }
}
+ // Compute the scalarization cost here with the original operands (to
+ // check for uniqueness etc), and then call getIntrinsicInstrCost()
+ // with the constructed vector types.
+ Type *RetTy = getVecTypeForPair(IT1, JT1);
+ unsigned ScalarizationCost = 0;
+ if (!RetTy->isVoidTy())
+ ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+ ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
FastMathFlags FMFV = FMFCI;
FMFV &= FMFCJ;
- Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+ unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+ ScalarizationCost);
if (VCost > ICost + JCost)
return false;
@@ -2502,7 +2515,7 @@ namespace {
if (I2 == I1 || isa<UndefValue>(I2))
I2 = nullptr;
}
-
+
if (HEE) {
Value *I3 = HEE->getOperand(0);
if (!I2 && I3 != I1)
@@ -2693,14 +2706,14 @@ namespace {
// so extend the smaller vector to be the same length as the larger one.
Instruction *NLOp;
if (numElemL > 1) {
-
+
std::vector<Constant *> Mask(numElemH);
unsigned v = 0;
for (; v < numElemL; ++v)
Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
for (; v < numElemH; ++v)
Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
+
NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
ConstantVector::get(Mask),
getReplacementName(IBeforeJ ? I : J,
@@ -2710,7 +2723,7 @@ namespace {
getReplacementName(IBeforeJ ? I : J,
true, o, 1));
}
-
+
NLOp->insertBefore(IBeforeJ ? J : I);
LOp = NLOp;
}
@@ -2720,7 +2733,7 @@ namespace {
if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
ArgTypeH, VArgType, IBeforeJ)) {
Instruction *S =
- InsertElementInst::Create(LOp, HOp,
+ InsertElementInst::Create(LOp, HOp,
ConstantInt::get(Type::getInt32Ty(Context),
numElemL),
getReplacementName(IBeforeJ ? I : J,
@@ -2737,7 +2750,7 @@ namespace {
Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
for (; v < numElemL; ++v)
Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
+
NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
ConstantVector::get(Mask),
getReplacementName(IBeforeJ ? I : J,
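
The BBVectorize hunks above only change how the individual costs are queried (the origin instruction and, for intrinsics, the actual argument values plus a separately computed scalarization overhead are now passed to TTI); the fusion rule itself is unchanged. A toy, self-contained sketch of that rule, assuming plain integers in place of the TTI queries (shouldFusePair and its parameters are illustrative names, not the LLVM API; the real code folds the scalarization overhead into getIntrinsicInstrCost rather than adding it afterwards):

#include <cstdio>

// Simplified model of the pair-fusion cost check in BBVectorize: the fused
// vector operation's cost (here, with the operand/return scalarization
// overhead simply added on top) must not exceed the sum of the two scalar
// costs, otherwise the candidate pair is rejected.
static bool shouldFusePair(unsigned ICost, unsigned JCost,
                           unsigned VectorOpCost, unsigned ScalarizationCost) {
  unsigned VCost = VectorOpCost + ScalarizationCost;
  return VCost <= ICost + JCost;
}

int main() {
  printf("%d\n", shouldFusePair(1, 1, 2, 0)); // 1: fuse, no extra overhead
  printf("%d\n", shouldFusePair(1, 1, 2, 3)); // 0: scalarization makes it lose
}
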
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c44a393cf846..4409d7a404f8 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size())
- --NumLeft;
- else if (NumLeft == 0)
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
@@ -588,7 +591,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
@@ -622,7 +625,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
if (TySize > VecRegSize / 2)
continue;
- if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
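
A small self-contained sketch of the new splitting policy in Vectorizer::splitOddVectorElts, assuming plain integers for the element size and chain length (splitPoint is an illustrative name, not the LLVM API): the chain is cut at the largest prefix whose size is a multiple of 4 bytes; if that prefix is the whole chain, an even chain is now split in half instead of merely dropping its last element, and an empty prefix still keeps one element.

#include <cstdio>

// Simplified model of splitOddVectorElts after this change: returns how many
// chain elements go into the first half.
static unsigned splitPoint(unsigned ElementSizeBytes, unsigned ChainSize) {
  unsigned SizeBytes = ElementSizeBytes * ChainSize;
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == ChainSize) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2;        // Split an even chain in half.
    else
      --NumLeft;           // Split off the last element.
  } else if (NumLeft == 0) {
    NumLeft = 1;           // Always make progress.
  }
  return NumLeft;
}

int main() {
  printf("%u\n", splitPoint(4, 6)); // 3: even chain of i32s halved
  printf("%u\n", splitPoint(4, 5)); // 4: odd chain drops its last element
  printf("%u\n", splitPoint(2, 3)); // 2: cut at the 4-byte boundary
  printf("%u\n", splitPoint(1, 3)); // 1: prefix would be empty, keep one
}
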
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index dac7032fa08f..595b2ec88943 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -50,6 +50,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -92,6 +93,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Vectorize.h"
@@ -266,21 +268,6 @@ static bool hasCyclesInLoopBody(const Loop &L) {
return false;
}
-/// \brief This modifies LoopAccessReport to initialize message with
-/// loop-vectorizer-specific part.
-class VectorizationReport : public LoopAccessReport {
-public:
- VectorizationReport(Instruction *I = nullptr)
- : LoopAccessReport("loop not vectorized: ", I) {}
-
- /// \brief This allows promotion of the loop-access analysis report into the
- /// loop-vectorizer report. It modifies the message to add the
- /// loop-vectorizer-specific part of the message.
- explicit VectorizationReport(const LoopAccessReport &R)
- : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
- R.getInstr()) {}
-};
-
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
@@ -290,31 +277,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-/// bitcast double** %var to i64* - can be skipped
-/// bitcast double** %var to i8* - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
- if (isa<GetElementPtrInst>(Ptr))
- return cast<GetElementPtrInst>(Ptr);
-
- if (isa<BitCastInst>(Ptr) &&
- isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
- Type *BitcastTy = Ptr->getType();
- Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
- if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
- return nullptr;
- Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
- Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
- const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
- if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
- return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
- }
- return nullptr;
-}
+// FIXME: The following helper functions have multiple implementations
+// in the project. They can be effectively organized in a common Load/Store
+// utilities unit.
/// A helper function that returns the pointer operand of a load or store
/// instruction.
@@ -326,6 +291,34 @@ static Value *getPointerOperand(Value *I) {
return nullptr;
}
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of load or store instruction.
+static unsigned getMemInstAlignment(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getAlignment();
+ return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerAddressSpace();
+ return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
@@ -351,6 +344,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -428,10 +438,17 @@ protected:
/// Copy and widen the instructions from the old loop.
virtual void vectorizeLoop();
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
/// \brief The Loop exit block may have single value PHI nodes where the
/// incoming value is 'Undef'. While vectorizing we only handled real values
/// that were defined inside the loop. Here we fix the 'undef case'.
@@ -448,7 +465,8 @@ protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions();
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
@@ -462,14 +480,14 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single BB within the innermost loop.
- void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+ /// A helper function to vectorize a single instruction within the innermost
+ /// loop.
+ void vectorizeInstruction(Instruction &I);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
- PhiVector *PV);
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
@@ -504,20 +522,21 @@ protected:
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
/// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
-
- /// Create a vector induction phi node based on an existing scalar one. This
- /// currently only works for integer induction variables with a constant
- /// step. \p EntryVal is the value from the original loop that maps to the
- /// vector phi node. If \p EntryVal is a truncate instruction, instead of
- /// widening the original IV, we widen a version of the IV truncated to \p
- /// EntryVal's type.
- void createVectorIntInductionPHI(const InductionDescriptor &II,
- Instruction *EntryVal);
-
- /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
- /// induction variable will first be truncated to the corresponding type.
- void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ Value *Step, Instruction *EntryVal);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -583,6 +602,10 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
/// This is a helper class for maintaining vectorization state. It's used for
/// mapping values from the original loop to their corresponding values in
/// the new loop. Two mappings are maintained: one for vectorized values and
@@ -777,14 +800,6 @@ protected:
// Record whether runtime checks are added.
bool AddedSafetyChecks;
- // Holds instructions from the original loop whose counterparts in the
- // vectorized loop would be trivially dead if generated. For example,
- // original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
-
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
@@ -803,8 +818,6 @@ public:
UnrollFactor, LVL, CM) {}
private:
- void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -832,12 +845,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-/// \brief Set the debug location in the builder using the debug location in the
-/// instruction.
-static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
- B.SetCurrentDebugLocation(Inst->getDebugLoc());
- else
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
B.SetCurrentDebugLocation(DebugLoc());
}
@@ -1497,14 +1512,6 @@ private:
OptimizationRemarkEmitter &ORE;
};
-static void emitAnalysisDiag(const Loop *TheLoop,
- const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter &ORE,
- const LoopAccessReport &Message) {
- const char *Name = Hints.vectorizeAnalysisPassName();
- LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
-}
-
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
@@ -1512,13 +1519,17 @@ static void emitMissedWarning(Function *F, Loop *L,
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
- emitLoopVectorizeWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop vectorization");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: "
+ << "failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
- emitLoopInterleaveWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop interleaving");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
+ L->getHeader())
+ << "loop not interleaved: "
+ << "failed explicitly specified loop interleaving");
}
}
@@ -1546,7 +1557,7 @@ public:
LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
@@ -1566,8 +1577,8 @@ public:
/// loop, only that it is legal to do so.
bool canVectorize();
- /// Returns the Induction variable.
- PHINode *getInduction() { return Induction; }
+ /// Returns the primary induction variable.
+ PHINode *getPrimaryInduction() { return PrimaryInduction; }
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
@@ -1607,12 +1618,6 @@ public:
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
- /// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
- /// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
@@ -1689,15 +1694,9 @@ public:
/// instructions that may divide by zero.
bool isScalarWithPredication(Instruction *I);
- /// Returns true if \p I is a memory instruction that has a consecutive or
- /// consecutive-like pointer operand. Consecutive-like pointers are pointers
- /// that are treated like consecutive pointers during vectorization. The
- /// pointer operands of interleaved accesses are an example.
- bool hasConsecutiveLikePtrOperand(Instruction *I);
-
- /// Returns true if \p I is a memory instruction that must be scalarized
- /// during vectorization.
- bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
private:
/// Check if a single basic block loop is vectorizable.
@@ -1715,24 +1714,6 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
- void collectLoopUniforms();
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
- void collectLoopScalars();
-
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1744,14 +1725,6 @@ private:
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
- /// Report an analysis message to assist the user in diagnosing loops that are
- /// not vectorized. These are handled as LoopAccessReport rather than
- /// VectorizationReport because the << operator of VectorizationReport returns
- /// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) const {
- emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
- }
-
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. If \p I is passed it is
@@ -1804,9 +1777,9 @@ private:
// --- vectorization state --- //
- /// Holds the integer induction variable. This is the counter of the
+ /// Holds the primary induction variable. This is the counter of the
/// loop.
- PHINode *Induction;
+ PHINode *PrimaryInduction;
/// Holds the reduction variables.
ReductionList Reductions;
/// Holds all of the induction variables that we found in the loop.
@@ -1822,12 +1795,6 @@ private:
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
- /// Holds the instructions known to be uniform after vectorization.
- SmallPtrSet<Instruction *, 4> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- SmallPtrSet<Instruction *, 4> Scalars;
-
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
@@ -1861,16 +1828,26 @@ public:
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization should be avoided up front.
+ Optional<unsigned> computeMaxVF(bool OptForSize);
+
/// Information about vectorization costs
struct VectorizationFactor {
unsigned Width; // Vector width with best cost
unsigned Cost; // Cost of the loop with that width
};
/// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(bool OptForSize);
+ VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+ /// Setup cost-based decisions for user vectorization factor.
+ void selectUserVectorizationFactor(unsigned UserVF) {
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1884,6 +1861,15 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
+ /// A memory access instruction may be vectorized in more than one way, and
+ /// the form it takes after vectorization depends on cost. This function
+ /// makes cost-based decisions for Load/Store instructions and collects them
+ /// in a map. This decision map is used for building the lists of
+ /// loop-uniform and loop-scalar instructions. The calculated cost is saved
+ /// with the widening decision to avoid redundant calculations.
+ void setCostBasedWideningDecision(unsigned VF);
+
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1918,14 +1904,118 @@ public:
return Scalars->second.count(I);
}
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+ auto UniformsPerVF = Uniforms.find(VF);
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+ auto ScalarsPerVF = Scalars.find(VF);
+ return ScalarsPerVF->second.count(I);
+ }
+
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
- !Legal->isScalarAfterVectorization(I);
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Decision that was taken during cost calculation for memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen,
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+ unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+ InstWidening W, unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ unsigned getWideningCost(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
+ bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionVariable(Op);
}
private:
+ /// \return An upper bound for the vectorization factor, larger than zero.
+ /// One is returned if vectorization should best be avoided due to cost.
+ unsigned computeFeasibleMaxVF(bool OptForSize);
+
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@@ -1949,6 +2039,26 @@ private:
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+ /// Calculate vectorization cost of memory instruction \p I.
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for scalarized memory instruction.
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for interleaving group of memory instructions.
+ unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for Gather/Scatter instruction.
+ unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+ /// The cost calculation for Load instruction \p I with uniform pointer -
+ /// scalar load + broadcast.
+ unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1972,12 +2082,24 @@ private:
/// pairs.
typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+ /// A set containing all BasicBlocks that are known to be present after
+ /// vectorization as predicated blocks.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -1990,6 +2112,44 @@ private:
/// the loop.
void collectInstsToScalarize(unsigned VF);
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(unsigned VF);
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on the CM decisions for Load/Store instructions that may
+ /// be vectorized as an interleaved access, a gather/scatter, or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
+ typedef DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>
+ DecisionList;
+
+ DecisionList WideningDecisions;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -2019,6 +2179,23 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+/// LoopVectorizationPlanner - drives the vectorization process after having
+/// passed Legality checks.
+class LoopVectorizationPlanner {
+public:
+ LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+
+ ~LoopVectorizationPlanner() {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+ unsigned UserVF);
+
+private:
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+};
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
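
LoopVectorizationPlanner is only a thin wrapper around the cost model at this point. Going by the surrounding comments, plan() presumably honours a non-zero user VF (after selectUserVectorizationFactor) and otherwise asks computeMaxVF for an upper bound and picks the most profitable power of two up to it. A standalone sketch of that selection loop under those assumptions, with a caller-supplied cost callback standing in for the cost model (plan, costOfVF and the toy cost below are illustrative, not the actual implementation):

#include <cstdio>
#include <functional>
#include <optional>

struct VectorizationFactor {
  unsigned Width; // Vector width with best cost.
  unsigned Cost;  // Cost of the loop with that width.
};

// Sketch of the planning step: if the user forced a VF, take it; otherwise
// try every power of two up to MaxVF and keep the width with the best cost
// per lane (VF = 1 is the scalar baseline).
static std::optional<VectorizationFactor>
plan(unsigned UserVF, std::optional<unsigned> MaxVF,
     const std::function<unsigned(unsigned)> &costOfVF) {
  if (UserVF)
    return VectorizationFactor{UserVF, 0}; // Cost not computed for a forced VF here.
  if (!MaxVF)
    return std::nullopt; // Vectorization should be avoided up front.
  VectorizationFactor Best{1, costOfVF(1)};
  for (unsigned VF = 2; VF <= *MaxVF; VF *= 2) {
    unsigned C = costOfVF(VF);
    if ((double)C / VF < (double)Best.Cost / Best.Width)
      Best = {VF, C};
  }
  return Best;
}

int main() {
  // Toy cost model: each doubling of VF adds a little fixed overhead.
  auto Cost = [](unsigned VF) { return 16 / VF + 2; };
  auto VFc = plan(/*UserVF=*/0, /*MaxVF=*/8, Cost);
  if (VFc)
    printf("VF=%u cost=%u\n", VFc->Width, VFc->Cost); // VF=8 cost=4
}
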
@@ -2134,8 +2311,6 @@ struct LoopVectorize : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
@@ -2156,7 +2331,7 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel.
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -2176,27 +2351,51 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-void InnerLoopVectorizer::createVectorIntInductionPHI(
- const InductionDescriptor &II, Instruction *EntryVal) {
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
Value *Start = II.getStartValue();
- ConstantInt *Step = II.getConstIntStepValue();
- assert(Step && "Can not widen an IV with a non-constant step");
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+ Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
- Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
- Value *SplatVF =
- ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
- VF * Step->getSExtValue()));
// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
@@ -2205,8 +2404,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = LastInduction;
- LastInduction = cast<Instruction>(
- Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
VectorLoopValueMap.initVector(EntryVal, Entry);
if (isa<TruncInst>(EntryVal))
@@ -2225,7 +2424,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
}
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Legal->isScalarAfterVectorization(I) ||
+ return Cost->isScalarAfterVectorization(I, VF) ||
Cost->isProfitableToScalarize(I, VF);
}
@@ -2239,7 +2438,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2251,9 +2453,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// induction variable.
Value *ScalarIV = nullptr;
- // The step of the induction.
- Value *Step = nullptr;
-
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
@@ -2266,45 +2465,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
- // If the induction variable has a constant integer step value, go ahead and
- // get it now.
- if (ID.getConstIntStepValue())
- Step = ID.getConstIntStepValue();
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant.
+ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+ "Induction step should be loop invariant");
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ Value *Step = nullptr;
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ LoopVectorPreHeader->getTerminator());
+ } else {
+ Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+ }
// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
- if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !shouldScalarizeInstruction(EntryVal)) {
- createVectorIntInductionPHI(ID, EntryVal);
+ if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
VectorizedIV = true;
}
// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
- // induction variable and constant step. Otherwise, derive these values from
- // the induction descriptor.
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
+ ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
if (Trunc) {
auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step && "Truncation requires constant integer step");
- auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
- ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
- Step = ConstantInt::getSigned(TruncType, StepInt);
- } else {
- ScalarIV = Induction;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- if (IV != OldInduction) {
- ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
- ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
- ScalarIV->setName("offset.idx");
- }
- if (!Step) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- &*Builder.GetInsertPoint());
- }
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
}
}
@@ -2314,7 +2517,8 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+ Entry[Part] =
+ getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.initVector(EntryVal, Entry);
if (Trunc)
addMetadata(Entry, Trunc);
@@ -2327,7 +2531,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
- buildScalarSteps(ScalarIV, Step, EntryVal);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2387,30 +2591,43 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal) {
+ Value *EntryVal,
+ const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
- "Val and Step should have the same integer type");
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
- auto *Mul = Builder.CreateMul(StartIdx, Step);
- auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+ auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
Entry[Part][Lane] = Add;
}
}
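
The rewritten buildScalarSteps computes, for lane Lane of unroll part Part, ScalarIV + (VF * Part + Lane) * Step, using add/mul for integer inductions and the induction's floating-point opcode plus fmul (with fast-math flags) otherwise. A standalone sketch of just that arithmetic for a floating-point induction (illustrative only, outside the LLVM API):

#include <cstdio>

// Simplified model of the scalar-step computation: lane L of unroll part P
// gets Start + (VF * P + L) * Step. For floating-point inductions the same
// formula is evaluated with fmul/fadd in the real code.
static void buildScalarSteps(double Start, double Step, unsigned VF,
                             unsigned Part, double *Out) {
  for (unsigned Lane = 0; Lane < VF; ++Lane) {
    double StartIdx = VF * Part + Lane;
    Out[Lane] = Start + StartIdx * Step;
  }
}

int main() {
  double Lanes[4];
  buildScalarSteps(/*Start=*/1.0, /*Step=*/0.5, /*VF=*/4, /*Part=*/1, Lanes);
  for (double V : Lanes)
    printf("%g ", V); // 3 3.5 4 4.5
  printf("\n");
}
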
@@ -2469,7 +2686,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// known to be uniform after vectorization, this corresponds to lane zero
// of the last unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the last unroll iteration.
- unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
@@ -2486,7 +2703,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// VectorLoopValueMap, we will only generate the insertelements once.
for (unsigned Part = 0; Part < UF; ++Part) {
Value *VectorValue = nullptr;
- if (Legal->isUniformAfterVectorization(I)) {
+ if (Cost->isUniformAfterVectorization(I, VF)) {
VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
} else {
VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
@@ -2515,8 +2732,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
- : true && "Uniform values only have lane zero");
+ assert(Lane > 0 ?
+ !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
@@ -2551,102 +2769,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
-// Get a mask to interleave \p NumVec vectors into a wide vector.
-// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
-// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
-// <0, 4, 1, 5, 2, 6, 3, 7>
-static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
- unsigned NumVec) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- for (unsigned j = 0; j < NumVec; j++)
- Mask.push_back(Builder.getInt32(j * VF + i));
-
- return ConstantVector::get(Mask);
-}
-
-// Get the strided mask starting from index \p Start.
-// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
-static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
- unsigned Stride, unsigned VF) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- Mask.push_back(Builder.getInt32(Start + i * Stride));
-
- return ConstantVector::get(Mask);
-}
-
-// Get a mask of two parts: The first part consists of sequential integers
-// starting from 0, The second part consists of UNDEFs.
-// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
- unsigned NumUndef) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumInt; i++)
- Mask.push_back(Builder.getInt32(i));
-
- Constant *Undef = UndefValue::get(Builder.getInt32Ty());
- for (unsigned i = 0; i < NumUndef; i++)
- Mask.push_back(Undef);
-
- return ConstantVector::get(Mask);
-}
-
-// Concatenate two vectors with the same element type. The 2nd vector should
-// not have more elements than the 1st vector. If the 2nd vector has less
-// elements, extend it with UNDEFs.
-static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
- Value *V2) {
- VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
- VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
- assert(VecTy1 && VecTy2 &&
- VecTy1->getScalarType() == VecTy2->getScalarType() &&
- "Expect two vectors with the same element type");
-
- unsigned NumElts1 = VecTy1->getNumElements();
- unsigned NumElts2 = VecTy2->getNumElements();
- assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
-
- if (NumElts1 > NumElts2) {
- // Extend with UNDEFs.
- Constant *ExtMask =
- getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
- V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
- }
-
- Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
- return Builder.CreateShuffleVector(V1, V2, Mask);
-}
-
-// Concatenate vectors in the given list. All vectors have the same type.
-static Value *ConcatenateVectors(IRBuilder<> &Builder,
- ArrayRef<Value *> InputList) {
- unsigned NumVec = InputList.size();
- assert(NumVec > 1 && "Should be at least two vectors");
-
- SmallVector<Value *, 8> ResList;
- ResList.append(InputList.begin(), InputList.end());
- do {
- SmallVector<Value *, 8> TmpList;
- for (unsigned i = 0; i < NumVec - 1; i += 2) {
- Value *V0 = ResList[i], *V1 = ResList[i + 1];
- assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
- "Only the last vector may have a different type");
-
- TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
- }
-
- // Push the last vector if the total number of vectors is odd.
- if (NumVec % 2 != 0)
- TmpList.push_back(ResList[NumVec - 1]);
-
- ResList = TmpList;
- NumVec = ResList.size();
- } while (NumVec > 1);
-
- return ResList[0];
-}
-
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
@@ -2683,15 +2805,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Instr != Group->getInsertPos())
return;
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
- Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+ Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
@@ -2731,7 +2851,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Value *UndefVec = UndefValue::get(VecTy);
// Vectorize the interleaved load group.
- if (LI) {
+ if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
@@ -2752,7 +2872,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
continue;
VectorParts Entry(UF);
- Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
+ Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], UndefVec, StrideMask, "strided.vec");
@@ -2796,10 +2916,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
}
// Concatenate all vectors into a wide vector.
- Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
+ Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
@@ -2816,103 +2936,44 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert((LI || SI) && "Invalid Load/Store instruction");
- // Try to vectorize the interleave group if this access is interleaved.
- if (Legal->isAccessInterleaved(Instr))
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
return vectorizeInterleaveGroup(Instr);
- Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getPointerOperand(Instr);
- unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target ABI alignment. We need to use the scalar's
// target ABI alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ unsigned AddressSpace = getMemInstAddressSpace(Instr);
// Scalarize the memory instruction if necessary.
- if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+ if (Decision == LoopVectorizationCostModel::CM_Scalarize)
return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
bool CreateGatherScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
VectorParts VectorGep;
// Handle consecutive loads/stores.
- GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {
- if (Gep) {
- unsigned NumOperands = Gep->getNumOperands();
-#ifndef NDEBUG
- // The original GEP that was identified as a consecutive memory access
- // should have only one loop-variant operand.
- unsigned NumOfLoopVariantOps = 0;
- for (unsigned i = 0; i < NumOperands; ++i)
- if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
- OrigLoop))
- NumOfLoopVariantOps++;
- assert(NumOfLoopVariantOps == 1 &&
- "Consecutive GEP should have only one loop-variant operand");
-#endif
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setName("gep.indvar");
-
- // A new GEP is created for a 0-lane value of the first unroll iteration.
- // The GEPs for the rest of the unroll iterations are computed below as an
- // offset from this GEP.
- for (unsigned i = 0; i < NumOperands; ++i)
- // We can apply getScalarValue() to all GEP indices. It returns the
- // original value for a loop-invariant operand and the 0-lane value for a
- // consecutive operand.
- Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
- 0, /* First unroll iteration */
- 0 /* 0-lane of the vector */ ));
- setDebugLocFromInst(Builder, Gep);
- Ptr = Builder.Insert(Gep2);
-
- } else { // No GEP
- setDebugLocFromInst(Builder, Ptr);
- Ptr = getScalarValue(Ptr, 0, 0);
- }
+ Ptr = getScalarValue(Ptr, 0, 0);
} else {
// At this point we should have a vector version of the GEP for Gather or Scatter
assert(CreateGatherScatter && "The instruction should be scalarized");
- if (Gep) {
- // Vectorizing GEP, across UF parts. We want to get a vector value for base
- // and each index that's defined inside the loop, even if it is
- // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
- // scalar.
- SmallVector<VectorParts, 4> OpsV;
- for (Value *Op : Gep->operands()) {
- Instruction *SrcInst = dyn_cast<Instruction>(Op);
- if (SrcInst && OrigLoop->contains(SrcInst))
- OpsV.push_back(getVectorValue(Op));
- else
- OpsV.push_back(VectorParts(UF, Op));
- }
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Ops;
- Value *GEPBasePtr = OpsV[0][Part];
- for (unsigned i = 1; i < Gep->getNumOperands(); i++)
- Ops.push_back(OpsV[i][Part]);
- Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
- cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
- assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
- NewGep =
- Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
- VectorGep.push_back(NewGep);
- }
- } else
- VectorGep = getVectorValue(Ptr);
+ VectorGep = getVectorValue(Ptr);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -3027,7 +3088,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3038,7 +3099,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Start if-block.
Value *Cmp = nullptr;
if (IfPredicateInstr) {
- Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
+ Cmp = Cond[Part];
+ if (Cmp->getType()->isVectorTy())
+ Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
ConstantInt::get(Cmp->getType(), 1));
}
@@ -3346,7 +3409,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// - counts from zero, stepping by one
// - is the size of the widest induction variable type
// then we create a new one.
- OldInduction = Legal->getInduction();
+ OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
// Split the single block loop into the two loop structure described above.
@@ -3543,7 +3606,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
namespace {
struct CSEDenseMapInfo {
- static bool canHandle(Instruction *I) {
+ static bool canHandle(const Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
@@ -3553,12 +3616,12 @@ struct CSEDenseMapInfo {
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
- static unsigned getHashValue(Instruction *I) {
+ static unsigned getHashValue(const Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
- static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
return LHS == RHS;
@@ -3589,51 +3652,6 @@ static void cse(BasicBlock *BB) {
}
}
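The CSEDenseMapInfo/cse machinery above keys instructions on their opcode and operand values so that structurally identical element inserts/extracts, shuffles, and GEPs created during widening can be folded into one. A minimal standalone sketch of that idea, using an ordinary map instead of a DenseMap and integer ids instead of Value pointers (all names here are illustrative):

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct FakeInstr {
  std::string Opcode;
  std::vector<int> OperandIds; // stand-ins for the operand Values
  int Id;
};

int main() {
  // Map from (opcode, operands) to the first instruction with that shape.
  std::map<std::pair<std::string, std::vector<int>>, int> Seen;
  std::vector<FakeInstr> Block = {
      {"extractelement", {7, 0}, 1},
      {"extractelement", {7, 0}, 2}, // same shape as instruction 1
      {"insertelement", {8, 9, 0}, 3}};
  for (const FakeInstr &I : Block) {
    auto Key = std::make_pair(I.Opcode, I.OperandIds);
    auto It = Seen.find(Key);
    if (It != Seen.end())
      std::printf("instr %d can reuse instr %d\n", I.Id, It->second);
    else
      Seen.emplace(Key, I.Id);
  }
  return 0;
}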
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V)) {
- FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
- cast<Instruction>(V)->setFastMathFlags(Flags);
- }
- return V;
-}
-
-/// \brief Estimate the overhead of scalarizing a value based on its type.
-/// Insert and Extract are set if the result needs to be inserted and/or
-/// extracted from vectors.
-static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
- const TargetTransformInfo &TTI) {
- if (Ty->isVoidTy())
- return 0;
-
- assert(Ty->isVectorTy() && "Can only scalarize vectors");
- unsigned Cost = 0;
-
- for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
- if (Extract)
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
- if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
- }
-
- return Cost;
-}
-
-/// \brief Estimate the overhead of scalarizing an Instruction based on the
-/// types of its operands and return value.
-static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
- Type *RetTy,
- const TargetTransformInfo &TTI) {
- unsigned ScalarizationCost =
- getScalarizationOverhead(RetTy, true, false, TTI);
-
- for (Type *Ty : OpTys)
- ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
-
- return ScalarizationCost;
-}
-
/// \brief Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -3641,14 +3659,24 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
if (VF == 1)
return 0;
+ unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(RetTy, true, false);
- SmallVector<Type *, 4> OpTys;
- unsigned OperandsNum = I->getNumOperands();
- for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
- OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ SmallVector<const Value *, 4> Operands(CI->arg_operands());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
+ else if (!isa<StoreInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()) {
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
- return getScalarizationOverhead(OpTys, RetTy, TTI);
+ return Cost;
}
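In other words, the overhead charged here is the cost of inserting each of the VF scalar results into a vector plus the cost of extracting VF lanes from each vector operand, with the load/store exceptions above when the target reports efficient per-element vector loads and stores. A standalone sketch of that bookkeeping, assuming a flat unit cost per element move instead of the real TTI queries (purely illustrative):

#include <cstdio>

// Illustrative only: real costs come from TargetTransformInfo.
static unsigned scalarizationOverhead(unsigned VF, unsigned NumVectorOperands,
                                      bool ProducesVectorResult) {
  unsigned Cost = 0;
  if (ProducesVectorResult)
    Cost += VF;                   // insert VF scalar results into a vector
  Cost += NumVectorOperands * VF; // extract VF lanes from each vector operand
  return Cost;
}

int main() {
  // A scalarized two-operand instruction at VF = 4: 4 inserts + 8 extracts.
  std::printf("%u\n", scalarizationOverhead(4, 2, true)); // 12
  return 0;
}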
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3681,7 +3709,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
+ unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3709,16 +3737,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
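These two helpers feed the later decision in vectorizeInstruction of whether to emit a vector intrinsic, a vectorized library call, or VF scalar calls. A rough standalone sketch of the comparison, with made-up cost numbers standing in for the TTI/TLI queries:

#include <algorithm>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  // Placeholder numbers; the real values come from TTI/TLI cost queries.
  unsigned ScalarCallCost = 10;
  unsigned ScalarizationCost = 12;  // unpack arguments, pack results
  unsigned VectorLibCallCost = 18;  // if TLI provides a vectorized routine
  unsigned IntrinsicCost = 14;      // if a vector intrinsic exists

  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost; // 52
  unsigned CallCost = std::min(ScalarizedCost, VectorLibCallCost);   // 18
  bool UseVectorIntrinsic = IntrinsicCost <= CallCost;               // true
  std::printf("call cost %u, use intrinsic: %d\n", CallCost, UseVectorIntrinsic);
  return 0;
}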
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -3861,30 +3885,27 @@ void InnerLoopVectorizer::vectorizeLoop() {
// the cost-model.
//
//===------------------------------------------------===//
- Constant *Zero = Builder.getInt32(0);
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. First,
- // we create a new vector PHI node with no incoming edges. We use this value
- // when we vectorize all of the instructions that use the PHI. Next, after
- // all of the instructions in the block are complete we add the new incoming
- // edges to the PHI. At this point all of the instructions in the basic block
- // are vectorized, so we can use them to construct the PHI.
- PhiVector PHIsToFix;
-
- // Collect instructions from the original loop that will become trivially
- // dead in the vectorized loop. We don't need to vectorize these
- // instructions.
- collectTriviallyDeadInstructions();
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
- // Vectorize all of the blocks in the original loop.
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- vectorizeBlockInLoop(BB, &PHIsToFix);
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ vectorizeInstruction(I);
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
@@ -3892,224 +3913,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
truncateToMinimalBitwidths();
// At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
- for (PHINode *Phi : PHIsToFix) {
- assert(Phi && "Unable to recover vectorized PHI");
-
- // Handle first-order recurrences that need to be fixed.
- if (Legal->isFirstOrderRecurrence(Phi)) {
- fixFirstOrderRecurrence(Phi);
- continue;
- }
-
- // If the phi node is not a first-order recurrence, it must be a reduction.
- // Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
- "Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
-
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
- RdxDesc.getMinMaxRecurrenceKind();
- setDebugLocFromInst(Builder, ReductionStartValue);
-
- // We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and override
- // one of the elements with the incoming scalar reduction. We need
- // to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
-
- // This is the vector-clone of the value that leaves the loop.
- const VectorParts &VectorExit = getVectorValue(LoopExitInst);
- Type *VecTy = VectorExit[0]->getType();
-
- // Find the reduction identity variable. Zero for addition, or and xor;
- // one for multiplication; -1 for and.
- Value *Identity;
- Value *VectorStart;
- if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
- RK == RecurrenceDescriptor::RK_FloatMinMax) {
- // MinMax reductions have the start value as their identity.
- if (VF == 1) {
- VectorStart = Identity = ReductionStartValue;
- } else {
- VectorStart = Identity =
- Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
- }
- } else {
- // Handle other reduction kinds:
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- if (VF == 1) {
- Identity = Iden;
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = ReductionStartValue;
- } else {
- Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart =
- Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
- }
- }
-
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- const VectorParts &VecRdxPhi = getVectorValue(Phi);
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
- const VectorParts &Val = getVectorValue(LoopVal);
- for (unsigned part = 0; part < UF; ++part) {
- // Make sure to add the reduction start value only to the
- // first unroll part.
- Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(StartVal, LoopVectorPreHeader);
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(Val[part], LoopVectorBody);
- }
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
- VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
- setDebugLocFromInst(Builder, LoopExitInst);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody->getTerminator());
- for (unsigned part = 0; part < UF; ++part) {
- Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[part]->user_begin();
- UI != RdxParts[part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
- RdxParts[part] = Extnd;
- } else {
- ++UI;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned part = 0; part < UF; ++part)
- RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
- setDebugLocFromInst(Builder, ReducedPartRdx);
- for (unsigned part = 1; part < UF; ++part) {
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
- ReducedPartRdx, "bin.rdx"));
- else
- ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
- Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
- }
-
- if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
- ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
- }
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
- // We know that the loop is in LCSSA form. We need to update the
- // PHI nodes in the exit blocks.
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end();
- LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi)
- break;
-
- // All PHINodes need to have a single entry edge, or two if
- // we already fixed them.
- assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
- // We found our reduction value exit-PHI. Update it with the
- // incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
- // Add an edge coming from the bypass.
- LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
- break;
- }
- } // end of the LCSSA phi scan.
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
- } // end of for each Phi in PHIsToFix.
+ fixCrossIterationPHIs();
// Update the dominator tree.
//
@@ -4134,6 +3941,25 @@ void InnerLoopVectorizer::vectorizeLoop() {
cse(LoopVectorBody);
}
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (Instruction &I : *OrigLoop->getHeader()) {
+ PHINode *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi)
+ break;
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(Phi))
+ fixFirstOrderRecurrence(Phi);
+ else if (Legal->isReductionVariable(Phi))
+ fixReduction(Phi);
+ }
+}
+
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -4212,15 +4038,17 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
- // Get the vectorized previous value. We ensured the previous value was an
- // instruction when detecting the recurrence.
+ // Get the vectorized previous value.
auto &PreviousParts = getVectorValue(Previous);
- // Set the insertion point to be after this instruction. We ensured the
- // previous value dominated all uses of the phi when detecting the
- // recurrence.
- Builder.SetInsertPoint(
- &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+ // Set the insertion point after the previous value if it is an instruction.
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
@@ -4251,18 +4079,33 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
- auto *Extract = Incoming;
+ auto *ExtractForScalar = Incoming;
if (VF > 1) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
- "vector.recur.extract");
- }
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second-to-last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value used when jumping to the exit block from LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When the loop is unrolled without vectorizing, initialize
+ // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
+ // value of `Incoming`. This is analogous to the vectorized case above:
+ // extracting the second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = PreviousParts[UF - 2];
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
}
@@ -4279,12 +4122,218 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (!LCSSAPhi)
break;
if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
break;
}
}
}
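To make the two extracts above concrete: for VF = 4 the last wide value of the recurrence holds the phi updates of the final four vector iterations, so lane VF - 1 seeds the scalar remainder loop while lane VF - 2 is the value the phi itself had on loop exit. A tiny standalone illustration (the numbers are arbitrary):

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  // Lanes of the final vector iteration of the recurrence update.
  std::array<int, VF> Incoming = {40, 41, 42, 43};
  int ExtractForScalar = Incoming[VF - 1];             // resumes the scalar loop
  int ExtractForPhiUsedOutsideLoop = Incoming[VF - 2]; // LCSSA phi of the phi
  std::printf("%d %d\n", ExtractForScalar, ExtractForPhiUsedOutsideLoop);
  return 0;
}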
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ Constant *Zero = Builder.getInt32(0);
+
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ const VectorParts &VectorExit = getVectorValue(LoopExitInst);
+ Type *VecTy = VectorExit[0]->getType();
+
+ // Find the reduction identity variable. Zero for addition, or and xor;
+ // one for multiplication; -1 for and.
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
+ }
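As a concrete example of the identity/start construction above, consider an integer add reduction with VF = 4 and start value 7: the identity vector is all zeros, and the start vector for unroll part 0 is that identity with the scalar start inserted into lane 0. A standalone sketch (plain C++, illustrative only):

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 4;
  const int Identity = 0;      // 0 for add/or/xor, 1 for mul, all-ones for and
  const int StartValue = 7;
  std::array<int, VF> IdentityVec;
  IdentityVec.fill(Identity);
  std::array<int, VF> VectorStart = IdentityVec;
  VectorStart[0] = StartValue; // only unroll part 0 carries the start value
  for (int V : VectorStart) std::printf("%d ", V); // 7 0 0 0
  std::printf("\n");
  return 0;
}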
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ const VectorParts &VecRdxPhi = getVectorValue(Phi);
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ const VectorParts &Val = getVectorValue(LoopVal);
+ for (unsigned part = 0; part < UF; ++part) {
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi[part])
+ ->addIncoming(Val[part], LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ for (unsigned part = 0; part < UF; ++part) {
+ Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[part]->user_begin();
+ UI != RdxParts[part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+ RdxParts[part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned part = 0; part < UF; ++part)
+ RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ setDebugLocFromInst(Builder, ReducedPartRdx);
+ for (unsigned part = 1; part < UF; ++part) {
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+ ReducedPartRdx, "bin.rdx"));
+ else
+ ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+ Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
+ }
+
+ if (VF > 1) {
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = ReducedPartRdx;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
+ else
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
+ TmpVec, Shuf);
+ }
+
+ // The result is in the first element of the vector.
+ ReducedPartRdx =
+ Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end();
+ LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi)
+ break;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found a reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ } // end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
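The log2(VF) shuffle reduction emitted above halves the number of live lanes each round by combining the upper half of the vector into the lower half. A standalone sketch for an integer add reduction at VF = 8 (plain C++; the in-place adds model the shuffle-plus-binop pairs the real code creates):

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 8;
  std::array<int, VF> Vec = {1, 2, 3, 4, 5, 6, 7, 8};
  // Rounds: 8 live lanes -> 4 -> 2 -> 1.
  for (unsigned i = VF; i != 1; i >>= 1)
    for (unsigned j = 0; j != i / 2; ++j)
      Vec[j] += Vec[i / 2 + j]; // lane j combined with mask element i/2 + j
  std::printf("%d\n", Vec[0]);  // 36: the reduced value lives in lane 0
  return 0;
}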
+
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (Instruction &LEI : *LoopExitBlock) {
auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
@@ -4296,7 +4345,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
}
-void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+void InnerLoopVectorizer::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
BasicBlock *Latch = OrigLoop->getLoopLatch();
// We create new control-flow for the vectorized loop, so the original
@@ -4563,9 +4613,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
- unsigned VF, PhiVector *PV) {
+ unsigned VF) {
PHINode *P = cast<PHINode>(PN);
- // Handle recurrences.
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
VectorParts Entry(UF);
for (unsigned part = 0; part < UF; ++part) {
@@ -4576,7 +4629,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
}
VectorLoopValueMap.initVector(P, Entry);
- PV->push_back(P);
return;
}
@@ -4631,7 +4683,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
- return widenIntInduction(P);
+ case InductionDescriptor::IK_FpInduction:
+ return widenIntOrFpInduction(P);
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4641,7 +4694,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
ScalarParts Entry(UF);
@@ -4658,30 +4711,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
VectorLoopValueMap.initScalar(P, Entry);
return;
}
- case InductionDescriptor::IK_FpInduction: {
- assert(P->getType() == II.getStartValue()->getType() &&
- "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- assert(P != OldInduction && "Primary induction can be integer only");
-
- Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
- V = II.transform(Builder, V, PSE.getSE(), DL);
- V->setName("fp.offset.idx");
-
- // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
- Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
- VectorParts Entry(UF);
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
- II.getInductionOpcode());
- VectorLoopValueMap.initVector(P, Entry);
- return;
- }
}
}
@@ -4703,269 +4732,323 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
- // For each instruction in the old loop.
- for (Instruction &I : *BB) {
-
- // If the instruction will become trivially dead when vectorized, we don't
- // need to generate it.
- if (DeadInstructions.count(&I))
- continue;
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+ // Scalarize instructions that should remain scalar after vectorization.
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+ return;
+ }
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
- isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- continue;
- }
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ break;
+ case Instruction::PHI: {
+ // Vectorize PHINodes.
+ widenPHIInstruction(&I, UF, VF);
+ break;
+ } // End of PHI.
+ case Instruction::GetElementPtr: {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+ auto *GEP = cast<GetElementPtrInst>(&I);
+ VectorParts Entry(UF);
- switch (I.getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF, PV);
- continue;
- } // End of PHI.
-
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- continue;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen binops.
- auto *BinOp = cast<BinaryOperator>(&I);
- setDebugLocFromInst(Builder, BinOp);
- const VectorParts &A = getVectorValue(BinOp->getOperand(0));
- const VectorParts &B = getVectorValue(BinOp->getOperand(1));
-
- // Use this vector value for all users of the original instruction.
- VectorParts Entry(UF);
+ if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
- if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
- VecOp->copyIRFlags(BinOp);
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getVectorValue(GEP->getPointerOperand())[Part];
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+ if (OrigLoop->isLoopInvariant(U.get()))
+ Indices.push_back(U.get());
+ else
+ Indices.push_back(getVectorValue(U.get())[Part]);
+ }
- Entry[Part] = V;
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(Ptr, Indices)
+ : Builder.CreateGEP(Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ Entry[Part] = NewGEP;
}
+ }
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, BinOp);
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, GEP);
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // block execution is conditional, otherwise fallthrough.
+ if (Legal->isScalarWithPredication(&I)) {
+ scalarizeInstruction(&I, true);
break;
}
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- auto *SE = PSE.getSE();
- bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- const VectorParts &Cond = getVectorValue(I.getOperand(0));
- const VectorParts &Op0 = getVectorValue(I.getOperand(1));
- const VectorParts &Op1 = getVectorValue(I.getOperand(2));
-
- auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ auto *BinOp = cast<BinaryOperator>(&I);
+ setDebugLocFromInst(Builder, BinOp);
+ const VectorParts &A = getVectorValue(BinOp->getOperand(0));
+ const VectorParts &B = getVectorValue(BinOp->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
+
+ Entry[Part] = V;
+ }
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = Builder.CreateSelect(
- InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
- }
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, BinOp);
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ const VectorParts &Cond = getVectorValue(I.getOperand(0));
+ const VectorParts &Op0 = getVectorValue(I.getOperand(1));
+ const VectorParts &Op1 = getVectorValue(I.getOperand(2));
+
+ auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = Builder.CreateSelect(
+ InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
}
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = dyn_cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
- const VectorParts &A = getVectorValue(Cmp->getOperand(0));
- const VectorParts &B = getVectorValue(Cmp->getOperand(1));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *C = nullptr;
- if (FCmp) {
- C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
- cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
- }
- Entry[Part] = C;
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ const VectorParts &A = getVectorValue(Cmp->getOperand(0));
+ const VectorParts &B = getVectorValue(Cmp->getOperand(1));
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *C = nullptr;
+ if (FCmp) {
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+ cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
}
+ Entry[Part] = C;
+ }
+
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(&I);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = dyn_cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+ if (Cost->isOptimizableIVTruncate(CI, VF)) {
+ widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+ cast<TruncInst>(CI));
break;
}
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
+ /// Vectorize casts.
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+
+ const VectorParts &A = getVectorValue(CI->getOperand(0));
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
+
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
break;
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = dyn_cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
-
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- auto ID = Legal->getInductionVars()->lookup(OldInduction);
- if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
- ID.getConstIntStepValue()) {
- widenIntInduction(OldInduction, cast<TruncInst>(CI));
- break;
- }
+ setDebugLocFromInst(Builder, &I);
- /// Vectorize casts.
- Type *DestTy =
- (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
- const VectorParts &A = getVectorValue(CI->getOperand(0));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&I);
+ break;
+ }
+ // The flag shows whether we use an intrinsic or a regular call for the
+ // vectorized version of the instruction, i.e. whether it is more beneficial
+ // to emit an intrinsic call than a library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(&I);
break;
}
- case Instruction::Call: {
- // Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- break;
- setDebugLocFromInst(Builder, &I);
-
- Module *M = BB->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
-
- StringRef FnName = CI->getCalledFunction()->getName();
- Function *F = CI->getCalledFunction();
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
- // The flag shows whether we use an intrinsic or a regular call for the
- // vectorized version of the instruction, i.e. whether it is more beneficial
- // to emit an intrinsic call than a library call.
- bool NeedToScalarize;
- unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
- }
-
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- Value *Arg = CI->getArgOperand(i);
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
- const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
- Arg = VectorArg[Part];
- }
- Args.push_back(Arg);
+ VectorParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+ const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ Arg = VectorArg[Part];
}
+ Args.push_back(Arg);
+ }
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- } else {
- // Use vector version of the library call.
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
- assert(!VFnName.empty() && "Vector function name is empty.");
- VectorF = M->getFunction(VFnName);
- if (!VectorF) {
- // Generate a declaration
- FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
- VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
- VectorF->copyAttributesFrom(F);
- }
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
}
- assert(VectorF && "Can't create vector function.");
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+ }
+ assert(VectorF && "Can't create vector function.");
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
- Entry[Part] = V;
- }
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ Entry[Part] = V;
}
- default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
- } // end of switch.
- } // end of for_each instr.
+ VectorLoopValueMap.initVector(&I, Entry);
+ addMetadata(Entry, &I);
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(&I);
+ break;
+ } // end of switch.
}
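Editorial note: the rewritten call handling above chooses between a vector intrinsic, a vectorized library call, and scalarization purely on cost. A minimal standalone C++ sketch of that decision follows; the plain boolean and integer inputs are hypothetical stand-ins for getVectorIntrinsicCost/getVectorCallCost, not the LLVM interfaces.

    // Sketch only: values stand in for the TTI/TLI cost queries.
    enum class CallWidening { Scalarize, VectorIntrinsic, VectorLibCall };

    CallWidening chooseCallWidening(bool HasVectorIntrinsic,
                                    unsigned IntrinsicCost, unsigned LibCallCost,
                                    bool NeedToScalarize) {
      // Prefer the intrinsic when it is no more expensive than the lib call.
      if (HasVectorIntrinsic && IntrinsicCost <= LibCallCost)
        return CallWidening::VectorIntrinsic;
      // Without a usable vector library variant, fall back to scalarization.
      if (NeedToScalarize)
        return CallWidening::Scalarize;
      return CallWidening::VectorLibCall;
    }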
void InnerLoopVectorizer::updateAnalysis() {
@@ -4976,11 +5059,10 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- // We don't predicate stores by this point, so the vector body should be a
- // single loop.
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
-
- DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
+ DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
+ LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock,
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -5145,12 +5227,6 @@ bool LoopVectorizationLegality::canVectorize() {
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
- // Collect all instructions that are known to be uniform after vectorization.
- collectLoopUniforms();
-
- // Collect all instructions that are known to be scalar after vectorization.
- collectLoopScalars();
-
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
@@ -5234,8 +5310,8 @@ void LoopVectorizationLegality::addInductionPhi(
// one if there are multiple (no good reason for doing this other
// than it is expedient). We've checked that it begins at zero and
// steps by one, so this is a canonical induction variable.
- if (!Induction || PhiTy == WidestIndTy)
- Induction = Phi;
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
}
// Both the PHI node itself, and the "post-increment" value feeding
@@ -5398,7 +5474,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
} // next instr.
}
- if (!Induction) {
+ if (!PrimaryInduction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
ORE->emit(createMissedAnalysis("NoInductionVariable")
@@ -5410,46 +5486,166 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.
- if (Induction && WidestIndTy != Induction->getType())
- Induction = nullptr;
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
return true;
}
-void LoopVectorizationLegality::collectLoopScalars() {
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
+ assert(VF >= 2 && !Scalars.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+    assert(Ptr == getPointerOperand(MemAccess) &&
+           "Ptr is neither a value nor a pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
- // If an instruction is uniform after vectorization, it will remain scalar.
- Scalars.insert(Uniforms.begin(), Uniforms.end());
+ // A helper that evaluates a memory access's use of a pointer. If the use
+ // will be a scalar use, and the pointer is only used by memory accesses, we
+ // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
- // Collect the getelementptr instructions that will not be vectorized. A
- // getelementptr instruction is only vectorized if it is used for a legal
- // gather or scatter operation.
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
+ // instructions marked uniform-after-vectorization, (2) bitcast and
+ // getelementptr instructions used by memory accesses requiring a scalar use,
+ // and (3) pointer induction variables and their update instructions (we
+ // currently only scalarize these).
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+  // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- Scalars.insert(GEP);
- continue;
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
}
- auto *Ptr = getPointerOperand(&I);
- if (!Ptr)
- continue;
- auto *GEP = getGEPInstruction(Ptr);
- if (GEP && isLegalGatherOrScatter(&I))
- Scalars.erase(GEP);
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
}
+ // (3) Add to the worklist all pointer induction variables and their update
+ // instructions.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we should
+ // no longer insert them into the worklist here.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+ continue;
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ }
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : *getInductionVars()) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ // We already considered pointer induction variables, so there's no reason
+ // to look at their users again.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we
+ // should no longer skip over them here.
+ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+ continue;
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarInd)
continue;
@@ -5458,23 +5654,19 @@ void LoopVectorizationLegality::collectLoopScalars() {
// scalar after vectorization.
auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarIndUpdate)
continue;
// The induction variable and its update instruction will remain scalar.
- Scalars.insert(Ind);
- Scalars.insert(IndUpdate);
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
}
-}
-bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
- if (isAccessInterleaved(I))
- return true;
- if (auto *Ptr = getPointerOperand(I))
- return isConsecutivePtr(Ptr);
- return false;
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
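Editorial note: the new collectLoopScalars above is a worklist closure: seed it with uniforms, scalar-use pointers, and pointer IVs, then pull in a loop-varying GEP/bitcast once all of its in-loop users are already known scalar. A simplified, LLVM-free sketch of just that expansion step; the Node type and its fields are invented for illustration and the user check is deliberately coarser than the real one.

    #include <unordered_set>
    #include <vector>

    struct Node {
      std::vector<Node *> Users;  // in-loop users of this value
      Node *PtrOperand = nullptr; // loop-varying GEP/bitcast operand, if any
    };

    // Expand the seed set: a pointer-producing operand becomes scalar once all
    // of its users are already in the scalar set.
    void expandScalars(std::vector<Node *> Worklist,
                       std::unordered_set<Node *> &Scalars) {
      Scalars.insert(Worklist.begin(), Worklist.end());
      for (size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
        Node *Src = Worklist[Idx]->PtrOperand;
        if (!Src || Scalars.count(Src))
          continue;
        bool AllUsersScalar = true;
        for (Node *U : Src->Users)
          AllUsersScalar &= (Scalars.count(U) != 0);
        if (AllUsersScalar) {
          Scalars.insert(Src);
          Worklist.push_back(Src);
        }
      }
    }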
bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
@@ -5494,48 +5686,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
- Instruction *I, unsigned VF) {
-
- // If the memory instruction is in an interleaved group, it will be
- // vectorized and its pointer will remain uniform.
- if (isAccessInterleaved(I))
- return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- // If the pointer operand is uniform (loop invariant), the memory instruction
- // will be scalarized.
auto *Ptr = getPointerOperand(I);
- if (LI && isUniform(Ptr))
- return true;
- // If the pointer operand is non-consecutive and neither a gather nor a
- // scatter operation is legal, the memory instruction will be scalarized.
- if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
- return true;
+  // First of all, in order to be widened, the pointer must be consecutive.
+ if (!isConsecutivePtr(Ptr))
+ return false;
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I))
- return true;
+ return false;
   // If the instruction's allocated size doesn't equal its type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL, VF))
- return true;
+ return false;
- // Otherwise, the memory instruction should be vectorized if the rest of the
- // loop is.
- return false;
+ return true;
}
-void LoopVectorizationLegality::collectLoopUniforms() {
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
+ assert(VF >= 2 && !Uniforms.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+  // Visit the list of Uniforms. Even if we find no uniform value, we will not
+  // analyze it again: Uniforms.count(VF) will return 1.
+ Uniforms[VF].clear();
+
// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after
// vectorization.
@@ -5568,6 +5760,14 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Holds pointer operands of instructions that are possibly non-uniform.
SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+ auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Interleave);
+ };
// Iterate over the instructions in the loop, and collect all
// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
// that a consecutive-like pointer operand will be scalarized, we collect it
@@ -5590,25 +5790,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
return getPointerOperand(U) == Ptr;
});
- // Ensure the memory instruction will not be scalarized, making its
- // pointer operand non-uniform. If the pointer operand is used by some
- // instruction other than a memory access, we're not going to check if
- // that other instruction may be scalarized here. Thus, conservatively
- // assume the pointer operand may be non-uniform.
- if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+ // Ensure the memory instruction will not be scalarized or used by
+ // gather/scatter, making its pointer operand non-uniform. If the pointer
+ // operand is used by any instruction other than a memory access, we
+ // conservatively assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
PossibleNonUniformPtrs.insert(Ptr);
// If the memory instruction will be vectorized and its pointer operand
- // is consecutive-like, the pointer operand should remain uniform.
- else if (hasConsecutiveLikePtrOperand(&I))
- ConsecutiveLikePtrs.insert(Ptr);
-
- // Otherwise, if the memory instruction will be vectorized and its
- // pointer operand is non-consecutive-like, the memory instruction should
- // be a gather or scatter operation. Its pointer operand will be
- // non-uniform.
+      // is consecutive-like or belongs to an interleave group, the pointer
+      // operand should remain uniform.
else
- PossibleNonUniformPtrs.insert(Ptr);
+ ConsecutiveLikePtrs.insert(Ptr);
}
// Add to the Worklist all consecutive and consecutive-like pointers that
@@ -5632,7 +5825,9 @@ void LoopVectorizationLegality::collectLoopUniforms() {
continue;
auto *OI = cast<Instruction>(OV);
if (all_of(OI->users(), [&](User *U) -> bool {
- return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ (OI == getPointerOperand(J) && isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
@@ -5643,7 +5838,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+ return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5652,7 +5847,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Inductions) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -5683,7 +5878,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
}
- Uniforms.insert(Worklist.begin(), Worklist.end());
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationLegality::canVectorizeMemory() {
@@ -5823,7 +6018,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
// An alignment of 0 means target ABI alignment.
- unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Align = getMemInstAlignment(&I);
if (!Align)
Align = DL.getABITypeAlignment(PtrTy->getElementType());
@@ -5978,6 +6173,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
continue;
+    // Ignore A if the memory objects of A and B don't belong to the same
+    // address space.
+ if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
+ continue;
+
// Calculate the distance from A to B.
const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
@@ -6020,36 +6220,36 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
- // Remove interleaved groups with gaps (currently only loads) whose memory
- // accesses may wrap around. We have to revisit the getPtrStride analysis,
- // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
+ // Remove interleaved groups with gaps (currently only loads) whose memory
+ // accesses may wrap around. We have to revisit the getPtrStride analysis,
+ // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
- // FORNOW we use Assume=false;
- // TODO: Change to Assume=true but making sure we don't exceed the threshold
+ // FORNOW we use Assume=false;
+ // TODO: Change to Assume=true but making sure we don't exceed the threshold
// of runtime SCEV assumptions checks (thereby potentially failing to
- // vectorize altogether).
+ // vectorize altogether).
// Additional optional optimizations:
- // TODO: If we are peeling the loop and we know that the first pointer doesn't
+ // TODO: If we are peeling the loop and we know that the first pointer doesn't
// wrap then we can deduce that all pointers in the group don't wrap.
- // This means that we can forcefully peel the loop in order to only have to
- // check the first pointer for no-wrap. When we'll change to use Assume=true
+ // This means that we can forcefully peel the loop in order to only have to
+ // check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
//
for (InterleaveGroup *Group : LoadGroups) {
// Case 1: A full group. Can Skip the checks; For full groups, if the wide
- // load would wrap around the address space we would do a memory access at
- // nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ // load would wrap around the address space we would do a memory access at
+ // nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
continue;
- // Case 2: If first and last members of the group don't wrap this implies
+ // Case 2: If first and last members of the group don't wrap this implies
// that all the pointers in the group don't wrap.
// So we check only group member 0 (which is always guaranteed to exist),
- // and group member Factor - 1; If the latter doesn't exist we rely on
+ // and group member Factor - 1; If the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
- if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
"first group member potentially pointer-wrapping.\n");
@@ -6065,8 +6265,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
"last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
- }
- else {
+ } else {
// Case 3: A non-reversed interleaved load group with gaps: We need
// to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
@@ -6082,27 +6281,62 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
}
-LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
- // Width 1 means no vectorize
- VectorizationFactor Factor = {1U, 0U};
- if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+ if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return None;
+ }
+
+ if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
+ return computeFeasibleMaxVF(OptForSize);
+
+ if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
DEBUG(dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return Factor;
+ return None;
}
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return Factor;
+ // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ // If we don't know the precise trip count, don't try to vectorize.
+ if (TC < 2) {
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
+ << "unable to calculate the loop count due to complex control flow");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
}
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+
+ if (TC % MaxVF != 0) {
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
+ // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+ // smaller MaxVF that does not require a scalar epilog.
+
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ return MaxVF;
+}
+
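Editorial note: a condensed sketch of the -Os/-Oz path in computeMaxVF above. With a known trip count, vectorization is only allowed when no scalar tail would remain; this deliberately ignores the conditional-store and runtime-check bailouts handled earlier in the function, and the names are illustrative only.

    #include <optional>

    std::optional<unsigned> maxVFWhenOptimizingForSize(unsigned TripCount,
                                                       unsigned FeasibleMaxVF) {
      if (TripCount < 2)                  // unknown or trivially small count
        return std::nullopt;
      if (TripCount % FeasibleMaxVF != 0) // a tail loop would be required
        return std::nullopt;
      return FeasibleMaxVF;
    }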
+unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -6136,7 +6370,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
- unsigned VF = MaxVectorSize;
+ unsigned MaxVF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
// Collect all viable vectorization factors.
SmallVector<unsigned, 8> VFs;
@@ -6152,54 +6386,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
- VF = VFs[i];
+ MaxVF = VFs[i];
break;
}
}
}
+ return MaxVF;
+}
- // If we optimize the program for size, avoid creating the tail loop.
- if (OptForSize) {
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
- // If we don't know the precise trip count, don't try to vectorize.
- if (TC < 2) {
- ORE->emit(
- createMissedAnalysis("UnknownLoopCountComplexCFG")
- << "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
-
- // Find the maximum SIMD width that can fit within the trip count.
- VF = TC % MaxVectorSize;
-
- if (VF == 0)
- VF = MaxVectorSize;
- else {
- // If the trip count that we found modulo the vectorization factor is not
- // zero then we require a tail.
- ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
- << "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
- }
-
- int UserVF = Hints->getWidth();
- if (UserVF != 0) {
- assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-
- Factor.Width = UserVF;
- collectInstsToScalarize(UserVF);
- return Factor;
- }
-
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
#ifndef NDEBUG
const float ScalarCost = Cost;
@@ -6209,12 +6405,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
// Ignore scalar width, because the user explicitly wants vectorization.
- if (ForceVectorization && VF > 1) {
+ if (ForceVectorization && MaxVF > 1) {
Width = 2;
Cost = expectedCost(Width).first / (float)Width;
}
- for (unsigned i = 2; i <= VF; i *= 2) {
+ for (unsigned i = 2; i <= MaxVF; i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
@@ -6238,8 +6434,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- Factor.Width = Width;
- Factor.Cost = Width * Cost;
+ VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
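Editorial note: the selection loop above normalizes each candidate's cost by its width before comparing, since a wider vector loop executes fewer iterations. A standalone sketch of that loop; expectedCost is a hypothetical callback standing in for the cost model.

    #include <functional>

    struct VFChoice { unsigned Width; float CostPerLane; };

    VFChoice selectVF(unsigned MaxVF,
                      const std::function<float(unsigned)> &expectedCost) {
      VFChoice Best = {1, expectedCost(1)};
      for (unsigned VF = 2; VF <= MaxVF; VF *= 2) {
        // The vector loop runs TC/VF iterations, so compare cost per lane.
        float PerLane = expectedCost(VF) / static_cast<float>(VF);
        if (PerLane < Best.CostPerLane)
          Best = {VF, PerLane};
      }
      return Best;
    }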
@@ -6277,9 +6472,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
T = ST->getValueOperand()->getType();
// Ignore loaded pointer types and stored pointer types that are not
- // consecutive. However, we do want to take consecutive stores/loads of
- // pointer vectors into account.
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6562,12 +6764,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
continue;
}
-
+ collectUniformsAndScalars(VFs[j]);
// Count the number of live intervals.
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst))
+ if (VecValuesToIgnore.count(Inst) ||
+ isScalarAfterVectorization(Inst, VFs[j]))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
}
@@ -6628,6 +6831,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
ScalarCostsTy ScalarCosts;
if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
}
}
}
@@ -6636,7 +6842,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {
- assert(!Legal->isUniformAfterVectorization(PredInst) &&
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
// Initialize the discount to zero, meaning that the scalar version and the
@@ -6657,7 +6863,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// already be scalar to avoid traversing chains that are unlikely to be
// beneficial.
if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
- Legal->isScalarAfterVectorization(I))
+ isScalarAfterVectorization(I, VF))
return false;
// If the instruction is scalar with predication, it will be analyzed
@@ -6677,7 +6883,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// the lane zero values for uniforms rather than asserting.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Legal->isUniformAfterVectorization(J))
+ if (isUniformAfterVectorization(J, VF))
return false;
// Otherwise, we can scalarize the instruction.
@@ -6690,7 +6896,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
- return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+ return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
};
// Compute the expected cost discount from scalarizing the entire expression
@@ -6717,8 +6923,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
- false, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+ true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}
@@ -6733,8 +6939,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J))
- ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
- false, true, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(
+ ToVectorTy(J->getType(),VF), false, true);
}
// Scale the total scalar cost by block probability.
@@ -6753,6 +6959,9 @@ LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ collectUniformsAndScalars(VF);
+
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
collectInstsToScalarize(VF);
@@ -6832,11 +7041,141 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ Value *Ptr = getPointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+ // if it's known in compile time
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+ Cost += VF *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, I);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (Legal->isScalarWithPredication(I))
+ Cost /= getReciprocalPredBlockProb();
+
+ return Cost;
+}
+
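Editorial note: getMemInstScalarizationCost above charges VF scalar address computations plus VF scalar memory operations plus insert/extract overhead, then divides by the predicated-block probability when the access sits behind a predicate. A simplified sketch with plain operands; the divisor 2 mirrors what getReciprocalPredBlockProb() is assumed to return.

    unsigned scalarizedMemCost(unsigned VF, unsigned AddrCost, unsigned MemOpCost,
                               unsigned InsertExtractCost, bool Predicated) {
      unsigned Cost = VF * AddrCost + VF * MemOpCost + InsertExtractCost;
      if (Predicated)
        Cost /= 2; // scale by the probability of executing the predicated block
      return Cost;
    }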
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ unsigned Cost = 0;
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ unsigned VF) {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *ValTy = LI->getType();
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = LI->getAlignment();
+ unsigned AS = LI->getPointerAddressSpace();
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+}
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ auto Group = Legal->getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+ Group->getFactor(), Indices,
+ Group->getAlignment(), AS);
+
+ if (Group->isReverse())
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ unsigned VF) {
+
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
+ if (VF == 1) {
+ Type *ValTy = getMemInstValueType(I);
+ unsigned Alignment = getMemInstAlignment(I);
+    unsigned AS = getMemInstAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ }
+ return getWideningCost(I, VF);
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
- if (Legal->isUniformAfterVectorization(I))
+ if (isUniformAfterVectorization(I, VF))
VF = 1;
if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6850,6 +7189,79 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return VectorizationCostTy(C, TypeNotScalarized);
}
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+ if (VF == 1)
+ return;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+ // Scalar load + broadcast
+ unsigned Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Widen, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
+ unsigned InterleaveCost = UINT_MAX;
+ unsigned NumAccesses = 1;
+ if (Legal->isAccessInterleaved(&I)) {
+ auto Group = Legal->getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
+ unsigned GatherScatterCost =
+ Legal->isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
+ : UINT_MAX;
+
+ unsigned ScalarizationCost =
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+ // Choose better solution for the current VF,
+ // write down this decision and use it during vectorization.
+ unsigned Cost;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+      // If the instruction belongs to an interleave group, the whole group
+ // receives the same decision. The whole group receives the cost, but
+ // the cost will actually be assigned to one instruction.
+ if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+}
+
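Editorial note: the decision at the end of setCostBasedWideningDecision above is a straight three-way cost comparison; interleaving wins ties against gather/scatter but must beat scalarization strictly. A minimal sketch of just that comparison, with the costs as plain inputs rather than cost-model queries.

    enum class WideningKind { Interleave, GatherScatter, Scalarize };

    WideningKind chooseWidening(unsigned InterleaveCost,
                                unsigned GatherScatterCost,
                                unsigned ScalarizationCost) {
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost)
        return WideningKind::Interleave;
      if (GatherScatterCost < ScalarizationCost)
        return WideningKind::GatherScatter;
      return WideningKind::Scalarize;
    }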
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned VF,
Type *&VectorTy) {
@@ -6868,7 +7280,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// instruction cost.
return 0;
case Instruction::Br: {
- return TTI.getCFInstrCost(I->getOpcode());
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+    // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
+ if (VF > 1 && BI->isConditional() &&
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
+ Type *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+ (TTI.getCFInstrCost(Instruction::Br) * VF));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
}
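Editorial note: the new Instruction::Br case above distinguishes three situations. A sketch of the same rule with the TTI queries replaced by hypothetical scalar inputs (ExtractOverhead roughly corresponds to the i1-vector scalarization overhead, BranchCost to the scalar branch cost).

    unsigned branchCost(bool AroundPredicatedBlock, bool IsLatchBranch,
                        unsigned VF, unsigned ExtractOverhead,
                        unsigned BranchCost) {
      if (VF > 1 && AroundPredicatedBlock)
        return ExtractOverhead + BranchCost * VF; // per-lane guard branches
      if (IsLatchBranch || VF == 1)
        return BranchCost;   // back-edge and scalar branches remain
      return 0;              // eliminated by if-conversion
    }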
case Instruction::PHI: {
auto *Phi = cast<PHINode>(I);
@@ -6969,7 +7405,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6978,130 +7414,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
- StoreInst *SI = dyn_cast<StoreInst>(I);
- LoadInst *LI = dyn_cast<LoadInst>(I);
- Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
- VectorTy = ToVectorTy(ValTy, VF);
-
- unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
- unsigned AS =
- SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
- Value *Ptr = getPointerOperand(I);
- // We add the cost of address computation here instead of with the gep
- // instruction because only here we know whether the operation is
- // scalarized.
- if (VF == 1)
- return TTI.getAddressComputationCost(VectorTy) +
- TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (LI && Legal->isUniform(Ptr)) {
- // Scalar load + broadcast
- unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
- Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
- return Cost +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
- }
-
- // For an interleaved access, calculate the total cost of the whole
- // interleave group.
- if (Legal->isAccessInterleaved(I)) {
- auto Group = Legal->getInterleavedAccessGroup(I);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Only calculate the cost once at the insert position.
- if (Group->getInsertPos() != I)
- return 0;
-
- unsigned InterleaveFactor = Group->getFactor();
- Type *WideVecTy =
- VectorType::get(VectorTy->getVectorElementType(),
- VectorTy->getVectorNumElements() * InterleaveFactor);
-
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
- SmallVector<unsigned, 4> Indices;
- if (LI) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
-
- // Calculate the cost of the whole interleaved group.
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS);
-
- if (Group->isReverse())
- Cost +=
- Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-
- // FIXME: The interleaved load group with a huge gap could be even more
- // expensive than scalar operations. Then we could ignore such group and
- // use scalar operations instead.
- return Cost;
- }
-
- // Check if the memory instruction will be scalarized.
- if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
- unsigned Cost = 0;
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
- // Figure out whether the access is strided and get the stride value
- // if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
-
- // Get the cost of the scalar memory instruction and address computation.
- Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
- Cost += VF *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
-
- // Get the overhead of the extractelement and insertelement instructions
- // we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, TTI);
-
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
- if (Legal->isScalarWithPredication(I))
- Cost /= getReciprocalPredBlockProb();
-
- return Cost;
- }
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
- bool UseGatherOrScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
- unsigned Cost = TTI.getAddressComputationCost(VectorTy);
- if (UseGatherOrScatter) {
- assert(ConsecutiveStride == 0 &&
- "Gather/Scatter are not used for consecutive stride");
- return Cost +
- TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment);
- }
- // Wide load/stores.
- if (Legal->isMaskRequired(I))
- Cost +=
- TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
- else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
+ VectorTy = ToVectorTy(getMemInstValueType(I), VF);
+ return getMemoryInstructionCost(I, VF);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -7115,12 +7433,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- // We optimize the truncation of induction variable.
- // The cost of these is the same as the scalar operation.
- if (I->getOpcode() == Instruction::Trunc &&
- Legal->isInductionVariable(I->getOperand(0)))
- return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
- I->getOperand(0)->getType());
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ Trunc->getSrcTy(), Trunc);
+ }
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
@@ -7143,7 +7463,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
}
- return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7172,9 +7492,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
@@ -7206,81 +7524,34 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
-
- // Insert values known to be scalar into VecValuesToIgnore. This is a
- // conservative estimation of the values that will later be scalarized.
- //
- // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
- // still be scalarized. For example, we may find an instruction to be
- // more profitable for a given vectorization factor if it were to be
- // scalarized. But at this point, we haven't yet computed the
- // vectorization factor.
- for (auto *BB : TheLoop->getBlocks())
- for (auto &I : *BB)
- if (Legal->isScalarAfterVectorization(&I))
- VecValuesToIgnore.insert(&I);
}
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
-
- setDebugLocFromInst(Builder, Instr);
-
- // Does this instruction return a value ?
- bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
- // Initialize a new scalar map entry.
- ScalarParts Entry(UF);
-
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
-
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(1);
- // For each scalar that we create:
-
- // Start an "if (pred) a[i] = ..." block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- if (Cond[Part]->getType()->isVectorTy())
- Cond[Part] =
- Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
- ConstantInt::get(Cond[Part]->getType(), 1));
- }
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
- Cloned->setOperand(op, NewOp);
- }
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Width 1 means no vectorize, cost 0 means uncomputed cost.
+ const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
+ 0U};
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
+ return NoVectorization;
- // Add the cloned scalar to the scalar map entry.
- Entry[Part][0] = Cloned;
+ if (UserVF) {
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ CM.selectUserVectorizationFactor(UserVF);
+ return {UserVF, 0};
+ }
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ unsigned MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF != 0 && "MaxVF is zero.");
+ if (MaxVF == 1)
+ return NoVectorization;
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
- }
- VectorLoopValueMap.initScalar(Instr, Entry);
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
}
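Editorial note: the new planner entry point above reorders the old selectVectorizationFactor logic: bail out when no feasible VF exists, honor a user-forced width, and otherwise ask the cost model for the best width up to the maximum. A compact sketch of that control flow; the callbacks are hypothetical stand-ins for the cost-model methods.

    #include <functional>
    #include <optional>

    struct VF { unsigned Width; unsigned Cost; };

    VF planVectorization(bool OptForSize, unsigned UserVF,
                         const std::function<std::optional<unsigned>(bool)> &computeMaxVF,
                         const std::function<VF(unsigned)> &selectVF) {
      const VF NoVectorization = {1, 0};  // width 1: do not vectorize
      std::optional<unsigned> MaxVF = computeMaxVF(OptForSize);
      if (!MaxVF)
        return NoVectorization;           // considered too costly or illegal
      if (UserVF)
        return {UserVF, 0};               // user-forced width, cost uncomputed
      if (*MaxVF == 1)
        return NoVectorization;
      return selectVF(*MaxVF);            // most profitable width up to MaxVF
    }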
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
@@ -7414,11 +7685,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
- CM.collectValuesToIgnore();
-
// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize =
@@ -7464,9 +7730,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Select the optimal vectorization factor.
- const LoopVectorizationCostModel::VectorizationFactor VF =
- CM.selectVectorizationFactor(OptForSize);
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor VF =
+ LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -7522,10 +7799,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
// Do not vectorize or interleaving the loop.
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second);
- ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second);
return false;
@@ -7621,6 +7898,16 @@ bool LoopVectorizePass::runImpl(
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+ // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
@@ -7632,9 +7919,15 @@ bool LoopVectorizePass::runImpl(
LoopsAnalyzed += Worklist.size();
// Now walk the identified inner loops.
- bool Changed = false;
- while (!Worklist.empty())
- Changed |= processLoop(Worklist.pop_back_val());
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= processLoop(L);
+ }
// Process each loop nest in the function.
return Changed;
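The restructuring above runs loop simplification over every loop up front, snapshots the inner loops into a worklist, and only forms LCSSA for the loops that actually get processed; the snapshot matters because vectorizing or unrolling can create new loops and invalidate live iterators. The same snapshot-then-mutate idiom in plain C++, purely for intuition (all names invented):

    #include <vector>

    // Processing an item may append new items to 'Items'; iterating over a
    // snapshot keeps the traversal well-defined even while the container grows.
    void processAll(std::vector<int> &Items,
                    void (*process)(std::vector<int> &, int)) {
      std::vector<int> Worklist(Items.begin(), Items.end()); // Snapshot first.
      while (!Worklist.empty()) {
        int Item = Worklist.back();
        Worklist.pop_back();
        process(Items, Item); // May mutate 'Items'; 'Worklist' is unaffected.
      }
    }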
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 328f27002960..da3ac06ab464 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -39,6 +39,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
- // Intersection is initialized to the 0th scalar,
- // so start counting from index '1'.
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
for (int i = 1, e = VL.size(); i < e; ++i) {
if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- Intersection->andIRFlags(Scalar);
+ VecOp->andIRFlags(Scalar);
}
- VecOp->copyIRFlags(Intersection);
}
}
}
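The propagateIRFlags rewrite above seeds the vector instruction with the first scalar's flags and then intersects in the flags of the remaining scalars, instead of mutating the first scalar itself. The same copy-then-intersect shape reduced to plain bitmasks (names invented, no LLVM types; assumes a non-empty list):

    #include <cstdint>
    #include <vector>

    // Seed with the first element's flags, then keep only the flags common to all.
    uint32_t intersectFlags(const std::vector<uint32_t> &ScalarFlags) {
      uint32_t Result = ScalarFlags.front(); // copyIRFlags(I0) analogue.
      for (size_t i = 1, e = ScalarFlags.size(); i < e; ++i)
        Result &= ScalarFlags[i];            // andIRFlags(Scalar) analogue.
      return Result;
    }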
@@ -304,6 +309,8 @@ public:
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+ ExtraValueToDebugLocsMap;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
@@ -330,6 +337,10 @@ public:
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+ /// Vectorize the tree but with the list of externally used values \p
+ /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+ /// generated extractelement instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -343,6 +354,13 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+ /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -404,7 +422,7 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
@@ -451,8 +469,9 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(nullptr),
- NeedToGather(0) {}
+ TreeEntry(std::vector<TreeEntry> &Container)
+ : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+ Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -468,11 +487,24 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather;
+
+ /// Points back to the VectorizableTree.
+ ///
+ /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ std::vector<TreeEntry> &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<int, 1> UserTreeIndices;
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.emplace_back();
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ int &UserTreeIdx) {
+ VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
@@ -485,6 +517,10 @@ private:
} else {
MustGather.insert(VL.begin(), VL.end());
}
+
+ if (UserTreeIdx >= 0)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+ UserTreeIdx = idx;
return Last;
}
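newTreeEntry now takes a UserTreeIdx by reference: the new entry records which entry uses it, and the caller's index is updated so children built afterwards point back at this entry. Because an existing entry can pick up additional users (the diamond-merge case below), the indices describe a DAG rather than a strict tree. A self-contained sketch of that bookkeeping, with invented names:

    #include <vector>

    struct Entry {
      std::vector<int> UserIndices; // Indices of the entries that use this one.
    };

    // Create a node, record its user (if any), and hand the new index back
    // through 'UserIdx' so that children created next can point at this entry.
    int newEntry(std::vector<Entry> &Tree, int &UserIdx) {
      Tree.push_back(Entry());
      int Idx = static_cast<int>(Tree.size()) - 1;
      if (UserIdx >= 0)
        Tree[Idx].UserIndices.push_back(UserIdx);
      UserIdx = Idx;
      return Idx;
    }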
@@ -558,7 +594,9 @@ private:
SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User).
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, which means that this Internal Scalar will be used later,
+ /// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
@@ -706,6 +744,8 @@ private:
return os;
}
#endif
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
///
@@ -916,17 +956,98 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ typedef TreeEntry *NodeRef;
+
+ /// \brief Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<ChildIteratorType,
+ SmallVector<int, 1>::iterator> {
+
+ std::vector<TreeEntry> &VectorizableTree;
+
+ ChildIteratorType(SmallVector<int, 1>::iterator W,
+ std::vector<TreeEntry> &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return &VectorizableTree[*I]; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
} // end namespace llvm
-} // end namespace slpvectorizer
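With the GraphTraits and DOTGraphTraits specializations above in scope, the generic machinery from llvm/Support/GraphWriter.h (added to the includes earlier in this patch) can render the SLP tree; the -view-slp-tree flag exercises it through ViewGraph in getTreeCost below. Purely as an illustration, a hypothetical helper in the same translation unit could dump the DOT text to a stream instead of launching a viewer; dumpSLPTreeAsDOT is not part of the patch:

    // Hypothetical helper, assumed to live in SLPVectorizer.cpp where BoUpSLP
    // and the specializations above are visible.
    static void dumpSLPTreeAsDOT(BoUpSLP *R, raw_ostream &OS) {
      WriteGraph(OS, R, /*ShortNames=*/false, "SLP tree");
    }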
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
- buildTree_rec(Roots, 0);
+ buildTree_rec(Roots, 0, -1);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
@@ -940,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
if (Entry->NeedToGather)
continue;
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, Lane);
+ continue;
+ }
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -976,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ int UserTreeIdx) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1026,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1039,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
+ // Record the reuse of the tree node. FIXME: currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
@@ -1052,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1062,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1076,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1085,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1100,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1117,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1132,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1144,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
} else {
BS.cancelScheduling(VL);
}
- newTreeEntry(VL, Reuse);
+ newTreeEntry(VL, Reuse, UserTreeIdx);
return;
}
case Instruction::Load: {
@@ -1160,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1171,7 +1303,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
@@ -1193,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1208,7 +1340,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
@@ -1235,12 +1367,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1249,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1263,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1278,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1301,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -1309,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1320,7 +1452,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1330,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1343,7 +1475,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1355,12 +1487,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1368,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1377,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
case Instruction::Call: {
@@ -1400,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1414,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1425,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1438,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1453,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1462,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1484,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
default:
BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1570,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
@@ -1599,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
- if (E->hasOneUse())
+ // If all users are going to be vectorized, the instruction can be
+ // considered dead.
+ // Likewise, if it has only one user, it will certainly be vectorized.
+ if (E->hasOneUse() ||
+ std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ }))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1624,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
+ VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -1636,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1720,18 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -1739,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys, VecTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
- VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
- VecTy->getNumElements()));
- }
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1753,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1947,9 +2086,18 @@ int BoUpSLP::getTreeCost() {
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n");
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
return Cost;
}
@@ -2702,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
@@ -2744,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (!is_contained(Scalar->users(), User))
+ if (User && !is_contained(Scalar->users(), User))
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2756,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() {
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
+ // If User == nullptr, the Scalar is used as extra arg. Generate
+ // ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ continue;
+ }
+
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
@@ -3645,9 +3821,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
@@ -4026,36 +4202,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4119,37 +4299,41 @@ namespace {
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
+ // Use a MapVector to keep the output order stable.
+ MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot;
- // After successfull horizontal reduction vectorization attempt for PHI node
- // vectorizer tries to update root binary op by combining vectorized tree and
- // the ReductionPHI node. But during vectorization this ReductionPHI can be
- // vectorized itself and replaced by the undef value, while the instruction
- // itself is marked for deletion. This 'marked for deletion' PHI node then can
- // be used in new binary operation, causing "Use still stuck around after Def
- // is destroyed" crash upon PHI node deletion.
- WeakVH ReductionPHI;
+ BinaryOperator *ReductionRoot = nullptr;
/// The opcode of the reduction.
- unsigned ReductionOpcode;
+ Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
/// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode;
+ unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction;
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as an extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not perform analysis of remaining operands of ParentStackElem.first
+ // instruction, this whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
public:
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
-
- /// Minimal width of available vector registers. It's used to determine
- /// ReduxWidth.
- unsigned MinVecRegSize;
-
- HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
- IsPairwiseReduction(false), ReduxWidth(0),
- MinVecRegSize(MinVecRegSize) {}
+ HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -4176,21 +4360,14 @@ public:
if (!isValidElementType(Ty))
return false;
- const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
- ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
- ReductionPHI = Phi;
-
- if (ReduxWidth < 4)
- return false;
// We currently only support adds.
- if (ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd)
+ if ((ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd) ||
+ !B->isAssociative())
return false;
// Post order traverse the reduction tree starting at B. We only handle true
@@ -4202,30 +4379,26 @@ public:
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
- // Only handle trees in the current basic block.
- if (TreeN->getParent() != B->getParent())
- return false;
-
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!TreeN->hasOneUse() && TreeN != B)
- return false;
-
// Postorder vist.
if (EdgeToVist == 2 || IsReducedValue) {
- if (IsReducedValue) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- if (!ReducedValueOpcode)
- ReducedValueOpcode = TreeN->getOpcode();
- else if (ReducedValueOpcode != TreeN->getOpcode())
- return false;
+ if (IsReducedValue)
ReducedVals.push_back(TreeN);
- } else {
- // We need to be able to reassociate the adds.
- if (!TreeN->isAssociative())
- return false;
- ReductionOps.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
@@ -4242,13 +4415,44 @@ public:
// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
- if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ // Only handle trees in the current basic block.
+ if (I->getParent() != B->getParent()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!I->hasOneUse() && I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (I->getOpcode() == ReductionOpcode) {
+ // We need to be able to reassociate the reduction operations.
+ if (!I->isAssociative()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueOpcode &&
+ ReducedValueOpcode != I->getOpcode()) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueOpcode)
ReducedValueOpcode = I->getOpcode();
+
Stack.push_back(std::make_pair(I, 0));
continue;
}
- return false;
}
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
}
return true;
}
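The relaxed matcher above no longer gives up when an operand of the reduction tree lives in another block, has extra uses, or carries the wrong opcode; such operands are recorded through markExtraArg and folded back in after vectorization. In scalar terms, the shape that is now accepted looks roughly like the function below (illustrative only, names invented):

    // a[0..3] are the reduced values that get vectorized; 'extra' is not part
    // of the reduction tree and is later added back as a "bin.extra" operation.
    int reduceWithExtraArg(const int *a, int extra) {
      return ((a[0] + a[1]) + (a[2] + a[3])) + extra;
    }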
@@ -4259,10 +4463,15 @@ public:
if (ReducedVals.empty())
return false;
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < ReduxWidth)
+ if (NumReducedVals < 4)
return false;
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
@@ -4270,20 +4479,26 @@ public:
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
- for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ // The same extra argument may be used several times, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs)
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
- continue;
+ break;
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ int Cost =
+ V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
if (Cost >= -SLPCostThreshold)
break;
@@ -4292,33 +4507,44 @@ public:
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction.
- Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ propagateIRFlags(VectorizedTree, ReductionOps);
} else
VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReducedVals[i])->getDebugLoc());
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedVals[i]);
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ assert(!Pair.second.empty() &&
+ "At least one DebugLoc must be inserted");
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ Pair.first, "bin.extra");
+ propagateIRFlags(VectorizedTree, I);
+ }
}
// Update users.
- if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
- assert(ReductionRoot && "Need a reduction operation");
- ReductionRoot->setOperand(0, VectorizedTree);
- ReductionRoot->setOperand(1, ReductionPHI);
- } else
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
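Rather than a single ReduxWidth fixed by the register size, tryToReduce above now starts from the largest power of two not exceeding the number of reduced values and shrinks it as values are consumed, finishing any short tail with scalar adds. A stand-alone sketch of just that chunking arithmetic, with PowerOf2Floor reimplemented locally for illustration:

    #include <cstdio>

    static unsigned powerOf2Floor(unsigned N) {
      if (N == 0)
        return 0;
      unsigned P = 1;
      while (P <= N / 2)
        P *= 2;
      return P;
    }

    // For NumReducedVals == 22 this prints chunks of 16 and 4, leaving a tail
    // of two values that the caller finishes with scalar adds.
    int main() {
      unsigned NumReducedVals = 22;
      unsigned i = 0;
      unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
      while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
        std::printf("vectorize chunk [%u, %u)\n", i, i + ReduxWidth);
        i += ReduxWidth;
        ReduxWidth = powerOf2Floor(NumReducedVals - i);
      }
      std::printf("scalar tail of %u values\n", NumReducedVals - i);
      return 0;
    }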
@@ -4329,7 +4555,8 @@ public:
private:
/// \brief Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -4352,15 +4579,9 @@ private:
return VecReduxCost - ScalarReduxCost;
}
- static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
- Value *R, const Twine &Name = "") {
- if (Opcode == Instruction::FAdd)
- return Builder.CreateFAdd(L, R, Name);
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
- }
-
/// \brief Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
@@ -4378,15 +4599,16 @@ private:
Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
} else {
Value *UpperHalf =
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
Value *Shuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
}
+ propagateIRFlags(TmpVec, RedOps);
}
// The result is in the first element of the vector.
@@ -4438,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!IV->hasOneUse())
- return false;
- Value *V = IV->getAggregateOperand();
- if (!isa<UndefValue>(V)) {
- InsertValueInst *I = dyn_cast<InsertValueInst>(V);
- if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ Value *V;
+ do {
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
return false;
- }
- BuildVector.push_back(IV);
- BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
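findBuildAggregate is rewritten from recursion to an iterative walk: it follows the aggregate-operand chain from the last insertvalue back towards the undef seed, collecting as it goes, and reverses the lists at the end so they come out in source order. The same collect-backwards-then-reverse idea on a plain linked structure (illustrative only, names invented):

    #include <algorithm>
    #include <vector>

    struct Link {
      const Link *Prev; // Earlier link in the build chain, or nullptr at the seed.
      int Value;
    };

    std::vector<int> collectChainInOrder(const Link *Last) {
      std::vector<int> Values;
      for (const Link *L = Last; L; L = L->Prev)
        Values.push_back(L->Value);                // Collected newest-to-oldest.
      std::reverse(Values.begin(), Values.end());  // Flip into source order.
      return Values;
    }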
@@ -4507,29 +4732,137 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+namespace {
+/// Tracks an instruction and its children.
+class WeakVHWithLevel final : public CallbackVH {
+ /// Operand index of the instruction currently being analyzed.
+ unsigned Level = 0;
+ /// Is this the instruction that should be vectorized, or are we now
+ /// processing children (i.e. operands of this instruction) for potential
+ /// vectorization?
+ bool IsInitial = true;
+
+public:
+ explicit WeakVHWithLevel() = default;
+ WeakVHWithLevel(Value *V) : CallbackVH(V){};
+ /// Restart children analysis each time it is replaced by a new instruction.
+ void allUsesReplacedWith(Value *New) override {
+ setValPtr(New);
+ Level = 0;
+ IsInitial = true;
+ }
+ /// Check if the instruction was not deleted during vectorization.
+ bool isValid() const { return !getValPtr(); }
+ /// Must the instruction itself be vectorized?
+ bool isInitial() const { return IsInitial; }
+ /// Try to vectorize children.
+ void clearInitial() { IsInitial = false; }
+ /// Are all children processed already?
+ bool isFinal() const {
+ assert(getValPtr() &&
+ (isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+ return getValPtr() &&
+ cast<Instruction>(getValPtr())->getNumOperands() == Level;
+ }
+ /// Get next child operation.
+ Value *nextOperand() {
+ assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() > Level);
+ return cast<Instruction>(getValPtr())->getOperand(Level++);
+ }
+ virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with reduction operators Root in a basic block BB, then check
+/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+static bool canBeVectorized(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
return false;
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (Root->getParent() != BB)
+ return false;
+ SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V = Stack.back();
+ if (!V) {
+ Stack.pop_back();
+ continue;
+ }
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst)) {
+ Stack.pop_back();
+ continue;
+ }
+ if (Stack.back().isInitial()) {
+ Stack.back().clearInitial();
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+ }
+ if (Stack.back().isFinal()) {
+ Stack.pop_back();
+ continue;
+ }
- return HorRdx.tryToReduce(R, TTI);
+ if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+ if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+ Stack.size() < RecursionMaxDepth)
+ Stack.push_back(NextV);
+ }
+ return Res;
+}
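canBeVectorized above walks operand chains with an explicit stack of WeakVHWithLevel entries, each remembering how many operands of its instruction have already been handed out, so the search needs no recursion and survives replacement of stack entries. Stripped of the value-handle machinery, the traversal state looks like this (hypothetical names, plain C++; the visited-set and depth cap of the real code are omitted for brevity):

    #include <utility>
    #include <vector>

    struct Node {
      std::vector<Node *> Children;
    };

    // Preorder walk driven by (node, next-child-index) pairs on an explicit stack.
    std::vector<Node *> preorder(Node *Root) {
      std::vector<Node *> Order;
      std::vector<std::pair<Node *, unsigned>> Stack;
      Stack.push_back({Root, 0u});
      while (!Stack.empty()) {
        Node *N = Stack.back().first;
        unsigned &NextChild = Stack.back().second;
        if (NextChild == 0)
          Order.push_back(N);               // First visit of this node.
        if (NextChild == N->Children.size()) {
          Stack.pop_back();                 // All children handed out.
          continue;
        }
        Node *Child = N->Children[NextChild++];
        Stack.push_back({Child, 0u});
      }
      return Order;
    }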
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return canBeVectorized(P, I, BB, R, TTI,
+ [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4599,67 +4932,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
- R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4672,16 +4980,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;